From 48d0dcbd5b7f80810ce259bc9ed6f57f99e27ca9 Mon Sep 17 00:00:00 2001
From: marha
Date: Wed, 16 Feb 2011 16:53:37 +0000
Subject: pixman mesa git update 16 Feb 2011

---
 mesalib/src/glsl/ir_constant_expression.cpp        |   58 +-
 mesalib/src/glsl/linker.cpp                        |    2 +-
 mesalib/src/mesa/main/mtypes.h                     | 6739 ++++++++++----------
 mesalib/src/mesa/main/state.c                      | 1466 ++---
 mesalib/src/mesa/state_tracker/st_atom_blend.c     |  601 +-
 mesalib/src/mesa/state_tracker/st_cb_bitmap.c      | 1783 +++---
 .../src/mesa/state_tracker/st_cb_bufferobjects.c   |  915 +--
 mesalib/src/mesa/state_tracker/st_cb_clear.c       | 1122 ++--
 mesalib/src/mesa/state_tracker/st_cb_drawpixels.c  | 2739 ++++----
 mesalib/src/mesa/state_tracker/st_cb_drawtex.c     |  611 +-
 mesalib/src/mesa/state_tracker/st_context.c        |    5 +
 mesalib/src/mesa/state_tracker/st_context.h        |  535 +-
 mesalib/src/mesa/state_tracker/st_draw.c           | 1537 +++--
 mesalib/src/mesa/state_tracker/st_draw_feedback.c  |    1 -
 mesalib/src/mesa/state_tracker/st_gen_mipmap.c     |  846 ++-
 mesalib/src/mesa/tnl/t_draw.c                      | 1028 +--
 mesalib/src/mesa/vbo/vbo_exec_array.c              | 2559 ++++----
 mesalib/src/mesa/vbo/vbo_exec_draw.c               |  841 +--
 mesalib/src/mesa/vbo/vbo_save_draw.c               |  609 +-
 pixman/Makefile.am                                 |    2 +-
 pixman/configure.ac                                |    1 +
 pixman/demos/Makefile.am                           |   34 +
 pixman/demos/alpha-test.c                          |  117 +
 pixman/demos/clip-in.c                             |   50 +
 pixman/demos/clip-test.c                           |   97 +
 pixman/demos/composite-test.c                      |  191 +
 pixman/demos/convolution-test.c                    |   47 +
 pixman/demos/gradient-test.c                       |   89 +
 pixman/demos/gtk-utils.c                           |  115 +
 pixman/demos/gtk-utils.h                           |   13 +
 pixman/demos/radial-test.c                         |  198 +
 pixman/demos/screen-test.c                         |   44 +
 pixman/demos/trap-test.c                           |   49 +
 pixman/pixman/pixman-arm-common.h                  |   59 +-
 pixman/pixman/pixman-arm-neon-asm.S                | 4758 +++++++-------
 pixman/pixman/pixman-arm-neon.c                    |   11 +
 pixman/pixman/pixman-fast-path.c                   | 4455 ++++++-------
 pixman/pixman/pixman-fast-path.h                   | 1039 +--
 pixman/pixman/pixman-sse2.c                        |  124 +-
 pixman/test/Makefile.am                            |  104 +-
 pixman/test/alpha-test.c                           |  117 -
 pixman/test/clip-in.c                              |   50 -
 pixman/test/clip-test.c                            |   97 -
 pixman/test/composite-test.c                       |  191 -
 pixman/test/convolution-test.c                     |   47 -
 pixman/test/gradient-test.c                        |   89 -
 pixman/test/gtk-utils.c                            |  115 -
 pixman/test/gtk-utils.h                            |   13 -
 pixman/test/radial-test.c                          |  198 -
 pixman/test/scaling-test.c                         |  618 +-
 pixman/test/screen-test.c                          |   44 -
 pixman/test/trap-test.c                            |   49 -
 52 files changed, 18838 insertions(+), 18384 deletions(-)
 create mode 100644 pixman/demos/Makefile.am
 create mode 100644 pixman/demos/alpha-test.c
 create mode 100644 pixman/demos/clip-in.c
 create mode 100644 pixman/demos/clip-test.c
 create mode 100644 pixman/demos/composite-test.c
 create mode 100644 pixman/demos/convolution-test.c
 create mode 100644 pixman/demos/gradient-test.c
 create mode 100644 pixman/demos/gtk-utils.c
 create mode 100644 pixman/demos/gtk-utils.h
 create mode 100644 pixman/demos/radial-test.c
 create mode 100644 pixman/demos/screen-test.c
 create mode 100644 pixman/demos/trap-test.c
 delete mode 100644 pixman/test/alpha-test.c
 delete mode 100644 pixman/test/clip-in.c
 delete mode 100644 pixman/test/clip-test.c
 delete mode 100644 pixman/test/composite-test.c
 delete mode 100644 pixman/test/convolution-test.c
 delete mode 100644 pixman/test/gradient-test.c
 delete mode 100644 pixman/test/gtk-utils.c
 delete mode 100644 pixman/test/gtk-utils.h
 delete mode 100644 pixman/test/radial-test.c
 delete mode 100644 pixman/test/screen-test.c
 delete mode 100644 pixman/test/trap-test.c

diff --git a/mesalib/src/glsl/ir_constant_expression.cpp b/mesalib/src/glsl/ir_constant_expression.cpp
index 2841fb350..2a3084896 100644
--- a/mesalib/src/glsl/ir_constant_expression.cpp
+++ b/mesalib/src/glsl/ir_constant_expression.cpp
@@ -288,24 +288,20 @@ ir_expression::constant_expression_value()
       break;
    case ir_unop_rcp:
-      /* FINISHME: Emit warning when division-by-zero is detected. */
       assert(op[0]->type->base_type == GLSL_TYPE_FLOAT);
       for (unsigned c = 0; c < op[0]->type->components(); c++) {
         switch (this->type->base_type) {
         case GLSL_TYPE_UINT:
-           if (op[0]->value.u[c] == 0.0)
-              return NULL;
-           data.u[c] = 1 / op[0]->value.u[c];
+           if (op[0]->value.u[c] != 0.0)
+              data.u[c] = 1 / op[0]->value.u[c];
            break;
         case GLSL_TYPE_INT:
-           if (op[0]->value.i[c] == 0.0)
-              return NULL;
-           data.i[c] = 1 / op[0]->value.i[c];
+           if (op[0]->value.i[c] != 0.0)
+              data.i[c] = 1 / op[0]->value.i[c];
            break;
         case GLSL_TYPE_FLOAT:
-           if (op[0]->value.f[c] == 0.0)
-              return NULL;
-           data.f[c] = 1.0F / op[0]->value.f[c];
+           if (op[0]->value.f[c] != 0.0)
+              data.f[c] = 1.0F / op[0]->value.f[c];
            break;
         default:
            assert(0);
@@ -314,13 +310,9 @@ ir_expression::constant_expression_value()
       break;
    case ir_unop_rsq:
-      /* FINISHME: Emit warning when division-by-zero is detected. */
       assert(op[0]->type->base_type == GLSL_TYPE_FLOAT);
       for (unsigned c = 0; c < op[0]->type->components(); c++) {
-        float s = sqrtf(op[0]->value.f[c]);
-        if (s == 0)
-           return NULL;
-        data.f[c] = 1.0F / s;
+        data.f[c] = 1.0F / sqrtf(op[0]->value.f[c]);
       }
       break;
@@ -523,18 +515,20 @@ ir_expression::constant_expression_value()
         switch (op[0]->type->base_type) {
         case GLSL_TYPE_UINT:
-           if (op[1]->value.u[c1] == 0)
-              return NULL;
-           data.u[c] = op[0]->value.u[c0] / op[1]->value.u[c1];
+           if (op[1]->value.u[c1] == 0) {
+              data.u[c] = 0;
+           } else {
+              data.u[c] = op[0]->value.u[c0] / op[1]->value.u[c1];
+           }
            break;
         case GLSL_TYPE_INT:
-           if (op[1]->value.i[c1] == 0)
-              return NULL;
-           data.i[c] = op[0]->value.i[c0] / op[1]->value.i[c1];
+           if (op[1]->value.i[c1] == 0) {
+              data.i[c] = 0;
+           } else {
+              data.i[c] = op[0]->value.i[c0] / op[1]->value.i[c1];
+           }
            break;
         case GLSL_TYPE_FLOAT:
-           if (op[1]->value.f[c1] == 0)
-              return NULL;
            data.f[c] = op[0]->value.f[c0] / op[1]->value.f[c1];
            break;
         default:
@@ -552,18 +546,20 @@ ir_expression::constant_expression_value()
         switch (op[0]->type->base_type) {
         case GLSL_TYPE_UINT:
-           if (op[1]->value.u[c1] == 0)
-              return NULL;
-           data.u[c] = op[0]->value.u[c0] % op[1]->value.u[c1];
+           if (op[1]->value.u[c1] == 0) {
+              data.u[c] = 0;
+           } else {
+              data.u[c] = op[0]->value.u[c0] % op[1]->value.u[c1];
+           }
            break;
         case GLSL_TYPE_INT:
-           if (op[1]->value.i[c1] == 0)
-              return NULL;
-           data.i[c] = op[0]->value.i[c0] % op[1]->value.i[c1];
+           if (op[1]->value.i[c1] == 0) {
+              data.i[c] = 0;
+           } else {
+              data.i[c] = op[0]->value.i[c0] % op[1]->value.i[c1];
+           }
            break;
         case GLSL_TYPE_FLOAT:
-           if (op[1]->value.f[c1] == 0)
-              return NULL;
            /* We don't use fmod because it rounds toward zero; GLSL specifies
            * the use of floor.
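
The hunks above change GLSL constant folding so that a constant division by zero no longer aborts folding (the old code returned NULL): integer division and modulus by zero now fold to 0, and float division simply yields the IEEE result. GLSL leaves division by zero undefined, so any defined value is legal as long as the compiler does not crash. A minimal standalone sketch of that folding rule; the fold_* helpers are invented for illustration and are not Mesa's API:

// Hypothetical helpers mirroring the folding rule introduced above.
#include <cassert>
#include <cstdio>

static unsigned fold_udiv(unsigned a, unsigned b) { return b == 0 ? 0 : a / b; } // u / 0 folds to 0
static int      fold_idiv(int a, int b)           { return b == 0 ? 0 : a / b; } // i / 0 folds to 0
static int      fold_imod(int a, int b)           { return b == 0 ? 0 : a % b; } // i % 0 folds to 0
static float    fold_fdiv(float a, float b)       { return a / b; }              // 1.0/0.0 -> +inf (IEEE)

int main()
{
   assert(fold_udiv(7u, 0u) == 0u);
   assert(fold_idiv(-7, 0) == 0);
   assert(fold_imod(5, 0) == 0);
   std::printf("1.0f / 0.0f folds to %f\n", fold_fdiv(1.0f, 0.0f));
   return 0;
}
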
*/ diff --git a/mesalib/src/glsl/linker.cpp b/mesalib/src/glsl/linker.cpp index 46cd1950c..6c003bb02 100644 --- a/mesalib/src/glsl/linker.cpp +++ b/mesalib/src/glsl/linker.cpp @@ -926,7 +926,7 @@ link_intrastage_shaders(void *mem_ctx, if (var->type->is_array() && (var->type->length == 0)) { const glsl_type *type = glsl_type::get_array_instance(var->type->fields.array, - var->max_array_access); + var->max_array_access + 1); assert(type != NULL); var->type = type; diff --git a/mesalib/src/mesa/main/mtypes.h b/mesalib/src/mesa/main/mtypes.h index 020595bd0..b5966dffe 100644 --- a/mesalib/src/mesa/main/mtypes.h +++ b/mesalib/src/mesa/main/mtypes.h @@ -1,3369 +1,3370 @@ -/* - * Mesa 3-D graphics library - * Version: 7.7 - * - * Copyright (C) 1999-2008 Brian Paul All Rights Reserved. - * Copyright (C) 2009 VMware, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN - * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -/** - * \file mtypes.h - * Main Mesa data structures. - * - * Please try to mark derived values with a leading underscore ('_'). - */ - -#ifndef MTYPES_H -#define MTYPES_H - - -#include "main/glheader.h" -#include "main/config.h" -#include "main/mfeatures.h" -#include "glapi/glapi.h" -#include "math/m_matrix.h" /* GLmatrix */ -#include "main/simple_list.h" /* struct simple_node */ -#include "main/formats.h" /* MESA_FORMAT_COUNT */ - - -/** - * Color channel data type. - */ -#if CHAN_BITS == 8 - typedef GLubyte GLchan; -#define CHAN_MAX 255 -#define CHAN_MAXF 255.0F -#define CHAN_TYPE GL_UNSIGNED_BYTE -#elif CHAN_BITS == 16 - typedef GLushort GLchan; -#define CHAN_MAX 65535 -#define CHAN_MAXF 65535.0F -#define CHAN_TYPE GL_UNSIGNED_SHORT -#elif CHAN_BITS == 32 - typedef GLfloat GLchan; -#define CHAN_MAX 1.0 -#define CHAN_MAXF 1.0F -#define CHAN_TYPE GL_FLOAT -#else -#error "illegal number of color channel bits" -#endif - - -/** - * Stencil buffer data type. - */ -#if STENCIL_BITS==8 - typedef GLubyte GLstencil; -#elif STENCIL_BITS==16 - typedef GLushort GLstencil; -#else -# error "illegal number of stencil bits" -#endif - - -/** - * \name 64-bit extension of GLbitfield. 
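
The linker.cpp hunk earlier in this patch fixes implicit sizing of unsized arrays at link time: if the highest constant index accessed is max_array_access, the linked array needs max_array_access + 1 elements, not max_array_access. A small sketch of that arithmetic under assumed, simplified types; unsized_array_var and implicit_array_length are inventions for illustration, not Mesa's ir_variable API:

#include <cassert>

// Stand-in for the relevant part of a shader IR variable (assumption).
struct unsized_array_var {
   unsigned max_array_access;   // highest constant index used, e.g. foo[3] -> 3
};

// Implicit array length per the fixed linker logic: one more than the
// highest index accessed.
static unsigned implicit_array_length(const unsized_array_var &v)
{
   return v.max_array_access + 1;
}

int main()
{
   unsized_array_var v = { 3 };              // shader only ever reads foo[3]
   assert(implicit_array_length(v) == 4);    // linked array must have 4 elements
   return 0;
}
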
- */ -/*@{*/ -typedef GLuint64 GLbitfield64; - -/** Set a single bit */ -#define BITFIELD64_BIT(b) (1ULL << (b)) - - -/** - * \name Some forward type declarations - */ -/*@{*/ -struct _mesa_HashTable; -struct gl_attrib_node; -struct gl_list_extensions; -struct gl_meta_state; -struct gl_pixelstore_attrib; -struct gl_program_cache; -struct gl_texture_format; -struct gl_texture_image; -struct gl_texture_object; -struct gl_context; -struct st_context; -/*@}*/ - - -/** Extra draw modes beyond GL_POINTS, GL_TRIANGLE_FAN, etc */ -#define PRIM_OUTSIDE_BEGIN_END (GL_POLYGON+1) -#define PRIM_INSIDE_UNKNOWN_PRIM (GL_POLYGON+2) -#define PRIM_UNKNOWN (GL_POLYGON+3) - - -/** - * Shader stages. Note that these will become 5 with tessellation. - * These MUST have the same values as gallium's PIPE_SHADER_* - */ -typedef enum -{ - MESA_SHADER_VERTEX = 0, - MESA_SHADER_FRAGMENT = 1, - MESA_SHADER_GEOMETRY = 2, - MESA_SHADER_TYPES = 3 -} gl_shader_type; - - - -/** - * Indexes for vertex program attributes. - * GL_NV_vertex_program aliases generic attributes over the conventional - * attributes. In GL_ARB_vertex_program shader the aliasing is optional. - * In GL_ARB_vertex_shader / OpenGL 2.0 the aliasing is disallowed (the - * generic attributes are distinct/separate). - */ -typedef enum -{ - VERT_ATTRIB_POS = 0, - VERT_ATTRIB_WEIGHT = 1, - VERT_ATTRIB_NORMAL = 2, - VERT_ATTRIB_COLOR0 = 3, - VERT_ATTRIB_COLOR1 = 4, - VERT_ATTRIB_FOG = 5, - VERT_ATTRIB_COLOR_INDEX = 6, - VERT_ATTRIB_POINT_SIZE = 6, /*alias*/ - VERT_ATTRIB_EDGEFLAG = 7, - VERT_ATTRIB_TEX0 = 8, - VERT_ATTRIB_TEX1 = 9, - VERT_ATTRIB_TEX2 = 10, - VERT_ATTRIB_TEX3 = 11, - VERT_ATTRIB_TEX4 = 12, - VERT_ATTRIB_TEX5 = 13, - VERT_ATTRIB_TEX6 = 14, - VERT_ATTRIB_TEX7 = 15, - VERT_ATTRIB_GENERIC0 = 16, - VERT_ATTRIB_GENERIC1 = 17, - VERT_ATTRIB_GENERIC2 = 18, - VERT_ATTRIB_GENERIC3 = 19, - VERT_ATTRIB_GENERIC4 = 20, - VERT_ATTRIB_GENERIC5 = 21, - VERT_ATTRIB_GENERIC6 = 22, - VERT_ATTRIB_GENERIC7 = 23, - VERT_ATTRIB_GENERIC8 = 24, - VERT_ATTRIB_GENERIC9 = 25, - VERT_ATTRIB_GENERIC10 = 26, - VERT_ATTRIB_GENERIC11 = 27, - VERT_ATTRIB_GENERIC12 = 28, - VERT_ATTRIB_GENERIC13 = 29, - VERT_ATTRIB_GENERIC14 = 30, - VERT_ATTRIB_GENERIC15 = 31, - VERT_ATTRIB_MAX = 32 -} gl_vert_attrib; - -/** - * Bitflags for vertex attributes. - * These are used in bitfields in many places. 
- */ -/*@{*/ -#define VERT_BIT_POS (1 << VERT_ATTRIB_POS) -#define VERT_BIT_WEIGHT (1 << VERT_ATTRIB_WEIGHT) -#define VERT_BIT_NORMAL (1 << VERT_ATTRIB_NORMAL) -#define VERT_BIT_COLOR0 (1 << VERT_ATTRIB_COLOR0) -#define VERT_BIT_COLOR1 (1 << VERT_ATTRIB_COLOR1) -#define VERT_BIT_FOG (1 << VERT_ATTRIB_FOG) -#define VERT_BIT_COLOR_INDEX (1 << VERT_ATTRIB_COLOR_INDEX) -#define VERT_BIT_EDGEFLAG (1 << VERT_ATTRIB_EDGEFLAG) -#define VERT_BIT_TEX0 (1 << VERT_ATTRIB_TEX0) -#define VERT_BIT_TEX1 (1 << VERT_ATTRIB_TEX1) -#define VERT_BIT_TEX2 (1 << VERT_ATTRIB_TEX2) -#define VERT_BIT_TEX3 (1 << VERT_ATTRIB_TEX3) -#define VERT_BIT_TEX4 (1 << VERT_ATTRIB_TEX4) -#define VERT_BIT_TEX5 (1 << VERT_ATTRIB_TEX5) -#define VERT_BIT_TEX6 (1 << VERT_ATTRIB_TEX6) -#define VERT_BIT_TEX7 (1 << VERT_ATTRIB_TEX7) -#define VERT_BIT_GENERIC0 (1 << VERT_ATTRIB_GENERIC0) -#define VERT_BIT_GENERIC1 (1 << VERT_ATTRIB_GENERIC1) -#define VERT_BIT_GENERIC2 (1 << VERT_ATTRIB_GENERIC2) -#define VERT_BIT_GENERIC3 (1 << VERT_ATTRIB_GENERIC3) -#define VERT_BIT_GENERIC4 (1 << VERT_ATTRIB_GENERIC4) -#define VERT_BIT_GENERIC5 (1 << VERT_ATTRIB_GENERIC5) -#define VERT_BIT_GENERIC6 (1 << VERT_ATTRIB_GENERIC6) -#define VERT_BIT_GENERIC7 (1 << VERT_ATTRIB_GENERIC7) -#define VERT_BIT_GENERIC8 (1 << VERT_ATTRIB_GENERIC8) -#define VERT_BIT_GENERIC9 (1 << VERT_ATTRIB_GENERIC9) -#define VERT_BIT_GENERIC10 (1 << VERT_ATTRIB_GENERIC10) -#define VERT_BIT_GENERIC11 (1 << VERT_ATTRIB_GENERIC11) -#define VERT_BIT_GENERIC12 (1 << VERT_ATTRIB_GENERIC12) -#define VERT_BIT_GENERIC13 (1 << VERT_ATTRIB_GENERIC13) -#define VERT_BIT_GENERIC14 (1 << VERT_ATTRIB_GENERIC14) -#define VERT_BIT_GENERIC15 (1 << VERT_ATTRIB_GENERIC15) - -#define VERT_BIT_TEX(u) (1 << (VERT_ATTRIB_TEX0 + (u))) -#define VERT_BIT_GENERIC(g) (1 << (VERT_ATTRIB_GENERIC0 + (g))) -/*@}*/ - - -/** - * Indexes for vertex program result attributes - */ -typedef enum -{ - VERT_RESULT_HPOS = 0, - VERT_RESULT_COL0 = 1, - VERT_RESULT_COL1 = 2, - VERT_RESULT_FOGC = 3, - VERT_RESULT_TEX0 = 4, - VERT_RESULT_TEX1 = 5, - VERT_RESULT_TEX2 = 6, - VERT_RESULT_TEX3 = 7, - VERT_RESULT_TEX4 = 8, - VERT_RESULT_TEX5 = 9, - VERT_RESULT_TEX6 = 10, - VERT_RESULT_TEX7 = 11, - VERT_RESULT_PSIZ = 12, - VERT_RESULT_BFC0 = 13, - VERT_RESULT_BFC1 = 14, - VERT_RESULT_EDGE = 15, - VERT_RESULT_VAR0 = 16, /**< shader varying */ - VERT_RESULT_MAX = (VERT_RESULT_VAR0 + MAX_VARYING) -} gl_vert_result; - - -/*********************************************/ - -/** - * Indexes for geometry program attributes. - */ -typedef enum -{ - GEOM_ATTRIB_POSITION = 0, - GEOM_ATTRIB_COLOR0 = 1, - GEOM_ATTRIB_COLOR1 = 2, - GEOM_ATTRIB_SECONDARY_COLOR0 = 3, - GEOM_ATTRIB_SECONDARY_COLOR1 = 4, - GEOM_ATTRIB_FOG_FRAG_COORD = 5, - GEOM_ATTRIB_POINT_SIZE = 6, - GEOM_ATTRIB_CLIP_VERTEX = 7, - GEOM_ATTRIB_PRIMITIVE_ID = 8, - GEOM_ATTRIB_TEX_COORD = 9, - - GEOM_ATTRIB_VAR0 = 16, - GEOM_ATTRIB_MAX = (GEOM_ATTRIB_VAR0 + MAX_VARYING) -} gl_geom_attrib; - -/** - * Bitflags for geometry attributes. - * These are used in bitfields in many places. 
- */ -/*@{*/ -#define GEOM_BIT_COLOR0 (1 << GEOM_ATTRIB_COLOR0) -#define GEOM_BIT_COLOR1 (1 << GEOM_ATTRIB_COLOR1) -#define GEOM_BIT_SCOLOR0 (1 << GEOM_ATTRIB_SECONDARY_COLOR0) -#define GEOM_BIT_SCOLOR1 (1 << GEOM_ATTRIB_SECONDARY_COLOR1) -#define GEOM_BIT_TEX_COORD (1 << GEOM_ATTRIB_TEX_COORD) -#define GEOM_BIT_FOG_COORD (1 << GEOM_ATTRIB_FOG_FRAG_COORD) -#define GEOM_BIT_POSITION (1 << GEOM_ATTRIB_POSITION) -#define GEOM_BIT_POINT_SIDE (1 << GEOM_ATTRIB_POINT_SIZE) -#define GEOM_BIT_CLIP_VERTEX (1 << GEOM_ATTRIB_CLIP_VERTEX) -#define GEOM_BIT_PRIM_ID (1 << GEOM_ATTRIB_PRIMITIVE_ID) -#define GEOM_BIT_VAR0 (1 << GEOM_ATTRIB_VAR0) - -#define GEOM_BIT_VAR(g) (1 << (GEOM_BIT_VAR0 + (g))) -/*@}*/ - - -/** - * Indexes for geometry program result attributes - */ -typedef enum -{ - GEOM_RESULT_POS = 0, - GEOM_RESULT_COL0 = 1, - GEOM_RESULT_COL1 = 2, - GEOM_RESULT_SCOL0 = 3, - GEOM_RESULT_SCOL1 = 4, - GEOM_RESULT_FOGC = 5, - GEOM_RESULT_TEX0 = 6, - GEOM_RESULT_TEX1 = 7, - GEOM_RESULT_TEX2 = 8, - GEOM_RESULT_TEX3 = 9, - GEOM_RESULT_TEX4 = 10, - GEOM_RESULT_TEX5 = 11, - GEOM_RESULT_TEX6 = 12, - GEOM_RESULT_TEX7 = 13, - GEOM_RESULT_PSIZ = 14, - GEOM_RESULT_CLPV = 15, - GEOM_RESULT_PRID = 16, - GEOM_RESULT_LAYR = 17, - GEOM_RESULT_VAR0 = 18, /**< shader varying, should really be 16 */ - /* ### we need to -2 because var0 is 18 instead 16 like in the others */ - GEOM_RESULT_MAX = (GEOM_RESULT_VAR0 + MAX_VARYING - 2) -} gl_geom_result; - - -/** - * Indexes for fragment program input attributes. - */ -typedef enum -{ - FRAG_ATTRIB_WPOS = 0, - FRAG_ATTRIB_COL0 = 1, - FRAG_ATTRIB_COL1 = 2, - FRAG_ATTRIB_FOGC = 3, - FRAG_ATTRIB_TEX0 = 4, - FRAG_ATTRIB_TEX1 = 5, - FRAG_ATTRIB_TEX2 = 6, - FRAG_ATTRIB_TEX3 = 7, - FRAG_ATTRIB_TEX4 = 8, - FRAG_ATTRIB_TEX5 = 9, - FRAG_ATTRIB_TEX6 = 10, - FRAG_ATTRIB_TEX7 = 11, - FRAG_ATTRIB_FACE = 12, /**< front/back face */ - FRAG_ATTRIB_PNTC = 13, /**< sprite/point coord */ - FRAG_ATTRIB_VAR0 = 14, /**< shader varying */ - FRAG_ATTRIB_MAX = (FRAG_ATTRIB_VAR0 + MAX_VARYING) -} gl_frag_attrib; - -/** - * Bitflags for fragment program input attributes. 
- */ -/*@{*/ -#define FRAG_BIT_WPOS (1 << FRAG_ATTRIB_WPOS) -#define FRAG_BIT_COL0 (1 << FRAG_ATTRIB_COL0) -#define FRAG_BIT_COL1 (1 << FRAG_ATTRIB_COL1) -#define FRAG_BIT_FOGC (1 << FRAG_ATTRIB_FOGC) -#define FRAG_BIT_FACE (1 << FRAG_ATTRIB_FACE) -#define FRAG_BIT_PNTC (1 << FRAG_ATTRIB_PNTC) -#define FRAG_BIT_TEX0 (1 << FRAG_ATTRIB_TEX0) -#define FRAG_BIT_TEX1 (1 << FRAG_ATTRIB_TEX1) -#define FRAG_BIT_TEX2 (1 << FRAG_ATTRIB_TEX2) -#define FRAG_BIT_TEX3 (1 << FRAG_ATTRIB_TEX3) -#define FRAG_BIT_TEX4 (1 << FRAG_ATTRIB_TEX4) -#define FRAG_BIT_TEX5 (1 << FRAG_ATTRIB_TEX5) -#define FRAG_BIT_TEX6 (1 << FRAG_ATTRIB_TEX6) -#define FRAG_BIT_TEX7 (1 << FRAG_ATTRIB_TEX7) -#define FRAG_BIT_VAR0 (1 << FRAG_ATTRIB_VAR0) - -#define FRAG_BIT_TEX(U) (FRAG_BIT_TEX0 << (U)) -#define FRAG_BIT_VAR(V) (FRAG_BIT_VAR0 << (V)) - -#define FRAG_BITS_TEX_ANY (FRAG_BIT_TEX0| \ - FRAG_BIT_TEX1| \ - FRAG_BIT_TEX2| \ - FRAG_BIT_TEX3| \ - FRAG_BIT_TEX4| \ - FRAG_BIT_TEX5| \ - FRAG_BIT_TEX6| \ - FRAG_BIT_TEX7) -/*@}*/ - - -/** - * Fragment program results - */ -typedef enum -{ - FRAG_RESULT_DEPTH = 0, - FRAG_RESULT_STENCIL = 1, - FRAG_RESULT_COLOR = 2, - FRAG_RESULT_DATA0 = 3, - FRAG_RESULT_MAX = (FRAG_RESULT_DATA0 + MAX_DRAW_BUFFERS) -} gl_frag_result; - - -/** - * Indexes for all renderbuffers - */ -typedef enum -{ - /* the four standard color buffers */ - BUFFER_FRONT_LEFT, - BUFFER_BACK_LEFT, - BUFFER_FRONT_RIGHT, - BUFFER_BACK_RIGHT, - BUFFER_DEPTH, - BUFFER_STENCIL, - BUFFER_ACCUM, - /* optional aux buffer */ - BUFFER_AUX0, - /* generic renderbuffers */ - BUFFER_COLOR0, - BUFFER_COLOR1, - BUFFER_COLOR2, - BUFFER_COLOR3, - BUFFER_COLOR4, - BUFFER_COLOR5, - BUFFER_COLOR6, - BUFFER_COLOR7, - BUFFER_COUNT -} gl_buffer_index; - -/** - * Bit flags for all renderbuffers - */ -#define BUFFER_BIT_FRONT_LEFT (1 << BUFFER_FRONT_LEFT) -#define BUFFER_BIT_BACK_LEFT (1 << BUFFER_BACK_LEFT) -#define BUFFER_BIT_FRONT_RIGHT (1 << BUFFER_FRONT_RIGHT) -#define BUFFER_BIT_BACK_RIGHT (1 << BUFFER_BACK_RIGHT) -#define BUFFER_BIT_AUX0 (1 << BUFFER_AUX0) -#define BUFFER_BIT_AUX1 (1 << BUFFER_AUX1) -#define BUFFER_BIT_AUX2 (1 << BUFFER_AUX2) -#define BUFFER_BIT_AUX3 (1 << BUFFER_AUX3) -#define BUFFER_BIT_DEPTH (1 << BUFFER_DEPTH) -#define BUFFER_BIT_STENCIL (1 << BUFFER_STENCIL) -#define BUFFER_BIT_ACCUM (1 << BUFFER_ACCUM) -#define BUFFER_BIT_COLOR0 (1 << BUFFER_COLOR0) -#define BUFFER_BIT_COLOR1 (1 << BUFFER_COLOR1) -#define BUFFER_BIT_COLOR2 (1 << BUFFER_COLOR2) -#define BUFFER_BIT_COLOR3 (1 << BUFFER_COLOR3) -#define BUFFER_BIT_COLOR4 (1 << BUFFER_COLOR4) -#define BUFFER_BIT_COLOR5 (1 << BUFFER_COLOR5) -#define BUFFER_BIT_COLOR6 (1 << BUFFER_COLOR6) -#define BUFFER_BIT_COLOR7 (1 << BUFFER_COLOR7) - -/** - * Mask of all the color buffer bits (but not accum). - */ -#define BUFFER_BITS_COLOR (BUFFER_BIT_FRONT_LEFT | \ - BUFFER_BIT_BACK_LEFT | \ - BUFFER_BIT_FRONT_RIGHT | \ - BUFFER_BIT_BACK_RIGHT | \ - BUFFER_BIT_AUX0 | \ - BUFFER_BIT_COLOR0 | \ - BUFFER_BIT_COLOR1 | \ - BUFFER_BIT_COLOR2 | \ - BUFFER_BIT_COLOR3 | \ - BUFFER_BIT_COLOR4 | \ - BUFFER_BIT_COLOR5 | \ - BUFFER_BIT_COLOR6 | \ - BUFFER_BIT_COLOR7) - - -/** - * Framebuffer configuration (aka visual / pixelformat) - * Note: some of these fields should be boolean, but it appears that - * code in drivers/dri/common/util.c requires int-sized fields. - */ -struct gl_config -{ - GLboolean rgbMode; - GLboolean floatMode; - GLboolean colorIndexMode; /* XXX is this used anywhere? 
*/ - GLuint doubleBufferMode; - GLuint stereoMode; - - GLboolean haveAccumBuffer; - GLboolean haveDepthBuffer; - GLboolean haveStencilBuffer; - - GLint redBits, greenBits, blueBits, alphaBits; /* bits per comp */ - GLuint redMask, greenMask, blueMask, alphaMask; - GLint rgbBits; /* total bits for rgb */ - GLint indexBits; /* total bits for colorindex */ - - GLint accumRedBits, accumGreenBits, accumBlueBits, accumAlphaBits; - GLint depthBits; - GLint stencilBits; - - GLint numAuxBuffers; - - GLint level; - - /* EXT_visual_rating / GLX 1.2 */ - GLint visualRating; - - /* EXT_visual_info / GLX 1.2 */ - GLint transparentPixel; - /* colors are floats scaled to ints */ - GLint transparentRed, transparentGreen, transparentBlue, transparentAlpha; - GLint transparentIndex; - - /* ARB_multisample / SGIS_multisample */ - GLint sampleBuffers; - GLint samples; - - /* SGIX_pbuffer / GLX 1.3 */ - GLint maxPbufferWidth; - GLint maxPbufferHeight; - GLint maxPbufferPixels; - GLint optimalPbufferWidth; /* Only for SGIX_pbuffer. */ - GLint optimalPbufferHeight; /* Only for SGIX_pbuffer. */ - - /* OML_swap_method */ - GLint swapMethod; - - /* EXT_texture_from_pixmap */ - GLint bindToTextureRgb; - GLint bindToTextureRgba; - GLint bindToMipmapTexture; - GLint bindToTextureTargets; - GLint yInverted; - - /* EXT_framebuffer_sRGB */ - GLint sRGBCapable; -}; - - -/** - * Data structure for color tables - */ -struct gl_color_table -{ - GLenum InternalFormat; /**< The user-specified format */ - GLenum _BaseFormat; /**< GL_ALPHA, GL_RGBA, GL_RGB, etc */ - GLuint Size; /**< number of entries in table */ - GLfloat *TableF; /**< Color table, floating point values */ - GLubyte *TableUB; /**< Color table, ubyte values */ - GLubyte RedSize; - GLubyte GreenSize; - GLubyte BlueSize; - GLubyte AlphaSize; - GLubyte LuminanceSize; - GLubyte IntensitySize; -}; - - -/** - * \name Bit flags used for updating material values. - */ -/*@{*/ -#define MAT_ATTRIB_FRONT_AMBIENT 0 -#define MAT_ATTRIB_BACK_AMBIENT 1 -#define MAT_ATTRIB_FRONT_DIFFUSE 2 -#define MAT_ATTRIB_BACK_DIFFUSE 3 -#define MAT_ATTRIB_FRONT_SPECULAR 4 -#define MAT_ATTRIB_BACK_SPECULAR 5 -#define MAT_ATTRIB_FRONT_EMISSION 6 -#define MAT_ATTRIB_BACK_EMISSION 7 -#define MAT_ATTRIB_FRONT_SHININESS 8 -#define MAT_ATTRIB_BACK_SHININESS 9 -#define MAT_ATTRIB_FRONT_INDEXES 10 -#define MAT_ATTRIB_BACK_INDEXES 11 -#define MAT_ATTRIB_MAX 12 - -#define MAT_ATTRIB_AMBIENT(f) (MAT_ATTRIB_FRONT_AMBIENT+(f)) -#define MAT_ATTRIB_DIFFUSE(f) (MAT_ATTRIB_FRONT_DIFFUSE+(f)) -#define MAT_ATTRIB_SPECULAR(f) (MAT_ATTRIB_FRONT_SPECULAR+(f)) -#define MAT_ATTRIB_EMISSION(f) (MAT_ATTRIB_FRONT_EMISSION+(f)) -#define MAT_ATTRIB_SHININESS(f)(MAT_ATTRIB_FRONT_SHININESS+(f)) -#define MAT_ATTRIB_INDEXES(f) (MAT_ATTRIB_FRONT_INDEXES+(f)) - -#define MAT_INDEX_AMBIENT 0 -#define MAT_INDEX_DIFFUSE 1 -#define MAT_INDEX_SPECULAR 2 - -#define MAT_BIT_FRONT_AMBIENT (1< ) */ - GLfloat _NormSpotDirection[4]; /**< normalized spotlight direction */ - GLfloat _VP_inf_spot_attenuation; - - GLfloat _SpotExpTable[EXP_TABLE_SIZE][2]; /**< to replace a pow() call */ - GLfloat _MatAmbient[2][3]; /**< material ambient * light ambient */ - GLfloat _MatDiffuse[2][3]; /**< material diffuse * light diffuse */ - GLfloat _MatSpecular[2][3]; /**< material spec * light specular */ - GLfloat _dli; /**< CI diffuse light intensity */ - GLfloat _sli; /**< CI specular light intensity */ - /*@}*/ -}; - - -/** - * Light model state. 
- */ -struct gl_lightmodel -{ - GLfloat Ambient[4]; /**< ambient color */ - GLboolean LocalViewer; /**< Local (or infinite) view point? */ - GLboolean TwoSide; /**< Two (or one) sided lighting? */ - GLenum ColorControl; /**< either GL_SINGLE_COLOR - * or GL_SEPARATE_SPECULAR_COLOR */ -}; - - -/** - * Material state. - */ -struct gl_material -{ - GLfloat Attrib[MAT_ATTRIB_MAX][4]; -}; - - -/** - * Accumulation buffer attribute group (GL_ACCUM_BUFFER_BIT) - */ -struct gl_accum_attrib -{ - GLfloat ClearColor[4]; /**< Accumulation buffer clear color */ -}; - - -/** - * Color buffer attribute group (GL_COLOR_BUFFER_BIT). - */ -struct gl_colorbuffer_attrib -{ - GLuint ClearIndex; /**< Index to use for glClear */ - GLclampf ClearColor[4]; /**< Color to use for glClear */ - - GLuint IndexMask; /**< Color index write mask */ - GLubyte ColorMask[MAX_DRAW_BUFFERS][4];/**< Each flag is 0xff or 0x0 */ - - GLenum DrawBuffer[MAX_DRAW_BUFFERS]; /**< Which buffer to draw into */ - - /** - * \name alpha testing - */ - /*@{*/ - GLboolean AlphaEnabled; /**< Alpha test enabled flag */ - GLenum AlphaFunc; /**< Alpha test function */ - GLclampf AlphaRef; /**< Alpha reference value */ - /*@}*/ - - /** - * \name Blending - */ - /*@{*/ - GLbitfield BlendEnabled; /**< Per-buffer blend enable flags */ - GLfloat BlendColor[4]; /**< Blending color */ - struct - { - GLenum SrcRGB; /**< RGB blend source term */ - GLenum DstRGB; /**< RGB blend dest term */ - GLenum SrcA; /**< Alpha blend source term */ - GLenum DstA; /**< Alpha blend dest term */ - GLenum EquationRGB; /**< GL_ADD, GL_SUBTRACT, etc. */ - GLenum EquationA; /**< GL_ADD, GL_SUBTRACT, etc. */ - } Blend[MAX_DRAW_BUFFERS]; - /** Are the blend func terms currently different for each buffer/target? */ - GLboolean _BlendFuncPerBuffer; - /** Are the blend equations currently different for each buffer/target? */ - GLboolean _BlendEquationPerBuffer; - /*@}*/ - - /** - * \name Logic op - */ - /*@{*/ - GLenum LogicOp; /**< Logic operator */ - GLboolean IndexLogicOpEnabled; /**< Color index logic op enabled flag */ - GLboolean ColorLogicOpEnabled; /**< RGBA logic op enabled flag */ - GLboolean _LogicOpEnabled; /**< RGBA logic op + EXT_blend_logic_op enabled flag */ - /*@}*/ - - GLboolean DitherFlag; /**< Dither enable flag */ - - GLenum ClampFragmentColor; /**< GL_TRUE, GL_FALSE or GL_FIXED_ONLY_ARB */ - GLenum ClampReadColor; /**< GL_TRUE, GL_FALSE or GL_FIXED_ONLY_ARB */ - - GLboolean sRGBEnabled; /**< Framebuffer sRGB blending/updating requested */ -}; - - -/** - * Current attribute group (GL_CURRENT_BIT). - */ -struct gl_current_attrib -{ - /** - * \name Current vertex attributes. - * \note Values are valid only after FLUSH_VERTICES has been called. - * \note Index and Edgeflag current values are stored as floats in the - * SIX and SEVEN attribute slots. - */ - GLfloat Attrib[VERT_ATTRIB_MAX][4]; /**< Position, color, texcoords, etc */ - - /** - * \name Current raster position attributes (always valid). - * \note This set of attributes is very similar to the SWvertex struct. - */ - /*@{*/ - GLfloat RasterPos[4]; - GLfloat RasterDistance; - GLfloat RasterColor[4]; - GLfloat RasterSecondaryColor[4]; - GLfloat RasterTexCoords[MAX_TEXTURE_COORD_UNITS][4]; - GLboolean RasterPosValid; - /*@}*/ -}; - - -/** - * Depth buffer attribute group (GL_DEPTH_BUFFER_BIT). 
- */ -struct gl_depthbuffer_attrib -{ - GLenum Func; /**< Function for depth buffer compare */ - GLclampd Clear; /**< Value to clear depth buffer to */ - GLboolean Test; /**< Depth buffering enabled flag */ - GLboolean Mask; /**< Depth buffer writable? */ - GLboolean BoundsTest; /**< GL_EXT_depth_bounds_test */ - GLfloat BoundsMin, BoundsMax;/**< GL_EXT_depth_bounds_test */ -}; - - -/** - * Evaluator attribute group (GL_EVAL_BIT). - */ -struct gl_eval_attrib -{ - /** - * \name Enable bits - */ - /*@{*/ - GLboolean Map1Color4; - GLboolean Map1Index; - GLboolean Map1Normal; - GLboolean Map1TextureCoord1; - GLboolean Map1TextureCoord2; - GLboolean Map1TextureCoord3; - GLboolean Map1TextureCoord4; - GLboolean Map1Vertex3; - GLboolean Map1Vertex4; - GLboolean Map1Attrib[16]; /* GL_NV_vertex_program */ - GLboolean Map2Color4; - GLboolean Map2Index; - GLboolean Map2Normal; - GLboolean Map2TextureCoord1; - GLboolean Map2TextureCoord2; - GLboolean Map2TextureCoord3; - GLboolean Map2TextureCoord4; - GLboolean Map2Vertex3; - GLboolean Map2Vertex4; - GLboolean Map2Attrib[16]; /* GL_NV_vertex_program */ - GLboolean AutoNormal; - /*@}*/ - - /** - * \name Map Grid endpoints and divisions and calculated du values - */ - /*@{*/ - GLint MapGrid1un; - GLfloat MapGrid1u1, MapGrid1u2, MapGrid1du; - GLint MapGrid2un, MapGrid2vn; - GLfloat MapGrid2u1, MapGrid2u2, MapGrid2du; - GLfloat MapGrid2v1, MapGrid2v2, MapGrid2dv; - /*@}*/ -}; - - -/** - * Fog attribute group (GL_FOG_BIT). - */ -struct gl_fog_attrib -{ - GLboolean Enabled; /**< Fog enabled flag */ - GLfloat Color[4]; /**< Fog color */ - GLfloat Density; /**< Density >= 0.0 */ - GLfloat Start; /**< Start distance in eye coords */ - GLfloat End; /**< End distance in eye coords */ - GLfloat Index; /**< Fog index */ - GLenum Mode; /**< Fog mode */ - GLboolean ColorSumEnabled; - GLenum FogCoordinateSource; /**< GL_EXT_fog_coord */ - GLfloat _Scale; /**< (End == Start) ? 1.0 : 1.0 / (End - Start) */ -}; - - -/** - * \brief Layout qualifiers for gl_FragDepth. - * - * Extension AMD_conservative_depth allows gl_FragDepth to be redeclared with - * a layout qualifier. - * - * \see enum ir_depth_layout - */ -enum gl_frag_depth_layout { - FRAG_DEPTH_LAYOUT_NONE, /**< No layout is specified. */ - FRAG_DEPTH_LAYOUT_ANY, - FRAG_DEPTH_LAYOUT_GREATER, - FRAG_DEPTH_LAYOUT_LESS, - FRAG_DEPTH_LAYOUT_UNCHANGED -}; - - -/** - * Hint attribute group (GL_HINT_BIT). - * - * Values are always one of GL_FASTEST, GL_NICEST, or GL_DONT_CARE. - */ -struct gl_hint_attrib -{ - GLenum PerspectiveCorrection; - GLenum PointSmooth; - GLenum LineSmooth; - GLenum PolygonSmooth; - GLenum Fog; - GLenum ClipVolumeClipping; /**< GL_EXT_clip_volume_hint */ - GLenum TextureCompression; /**< GL_ARB_texture_compression */ - GLenum GenerateMipmap; /**< GL_SGIS_generate_mipmap */ - GLenum FragmentShaderDerivative; /**< GL_ARB_fragment_shader */ -}; - -/** - * Light state flags. - */ -/*@{*/ -#define LIGHT_SPOT 0x1 -#define LIGHT_LOCAL_VIEWER 0x2 -#define LIGHT_POSITIONAL 0x4 -#define LIGHT_NEED_VERTICES (LIGHT_POSITIONAL|LIGHT_LOCAL_VIEWER) -/*@}*/ - - -/** - * Lighting attribute group (GL_LIGHT_BIT). 
- */ -struct gl_light_attrib -{ - struct gl_light Light[MAX_LIGHTS]; /**< Array of light sources */ - struct gl_lightmodel Model; /**< Lighting model */ - - /** - * Must flush FLUSH_VERTICES before referencing: - */ - /*@{*/ - struct gl_material Material; /**< Includes front & back values */ - /*@}*/ - - GLboolean Enabled; /**< Lighting enabled flag */ - GLenum ShadeModel; /**< GL_FLAT or GL_SMOOTH */ - GLenum ProvokingVertex; /**< GL_EXT_provoking_vertex */ - GLenum ColorMaterialFace; /**< GL_FRONT, BACK or FRONT_AND_BACK */ - GLenum ColorMaterialMode; /**< GL_AMBIENT, GL_DIFFUSE, etc */ - GLbitfield ColorMaterialBitmask; /**< bitmask formed from Face and Mode */ - GLboolean ColorMaterialEnabled; - GLenum ClampVertexColor; - - struct gl_light EnabledList; /**< List sentinel */ - - /** - * Derived state for optimizations: - */ - /*@{*/ - GLboolean _NeedEyeCoords; - GLboolean _NeedVertices; /**< Use fast shader? */ - GLbitfield _Flags; /**< LIGHT_* flags, see above */ - GLfloat _BaseColor[2][3]; - /*@}*/ -}; - - -/** - * Line attribute group (GL_LINE_BIT). - */ -struct gl_line_attrib -{ - GLboolean SmoothFlag; /**< GL_LINE_SMOOTH enabled? */ - GLboolean StippleFlag; /**< GL_LINE_STIPPLE enabled? */ - GLushort StipplePattern; /**< Stipple pattern */ - GLint StippleFactor; /**< Stipple repeat factor */ - GLfloat Width; /**< Line width */ -}; - - -/** - * Display list attribute group (GL_LIST_BIT). - */ -struct gl_list_attrib -{ - GLuint ListBase; -}; - - -/** - * Multisample attribute group (GL_MULTISAMPLE_BIT). - */ -struct gl_multisample_attrib -{ - GLboolean Enabled; - GLboolean _Enabled; /**< true if Enabled and multisample buffer */ - GLboolean SampleAlphaToCoverage; - GLboolean SampleAlphaToOne; - GLboolean SampleCoverage; - GLfloat SampleCoverageValue; - GLboolean SampleCoverageInvert; -}; - - -/** - * A pixelmap (see glPixelMap) - */ -struct gl_pixelmap -{ - GLint Size; - GLfloat Map[MAX_PIXEL_MAP_TABLE]; - GLubyte Map8[MAX_PIXEL_MAP_TABLE]; /**< converted to 8-bit color */ -}; - - -/** - * Collection of all pixelmaps - */ -struct gl_pixelmaps -{ - struct gl_pixelmap RtoR; /**< i.e. GL_PIXEL_MAP_R_TO_R */ - struct gl_pixelmap GtoG; - struct gl_pixelmap BtoB; - struct gl_pixelmap AtoA; - struct gl_pixelmap ItoR; - struct gl_pixelmap ItoG; - struct gl_pixelmap ItoB; - struct gl_pixelmap ItoA; - struct gl_pixelmap ItoI; - struct gl_pixelmap StoS; -}; - - -/** - * Pixel attribute group (GL_PIXEL_MODE_BIT). - */ -struct gl_pixel_attrib -{ - GLenum ReadBuffer; /**< source buffer for glRead/CopyPixels() */ - - /*--- Begin Pixel Transfer State ---*/ - /* Fields are in the order in which they're applied... */ - - /** Scale & Bias (index shift, offset) */ - /*@{*/ - GLfloat RedBias, RedScale; - GLfloat GreenBias, GreenScale; - GLfloat BlueBias, BlueScale; - GLfloat AlphaBias, AlphaScale; - GLfloat DepthBias, DepthScale; - GLint IndexShift, IndexOffset; - /*@}*/ - - /* Pixel Maps */ - /* Note: actual pixel maps are not part of this attrib group */ - GLboolean MapColorFlag; - GLboolean MapStencilFlag; - - /*--- End Pixel Transfer State ---*/ - - /** glPixelZoom */ - GLfloat ZoomX, ZoomY; - - /** GL_SGI_texture_color_table */ - GLfloat TextureColorTableScale[4]; /**< RGBA */ - GLfloat TextureColorTableBias[4]; /**< RGBA */ -}; - - -/** - * Point attribute group (GL_POINT_BIT). 
- */ -struct gl_point_attrib -{ - GLboolean SmoothFlag; /**< True if GL_POINT_SMOOTH is enabled */ - GLfloat Size; /**< User-specified point size */ - GLfloat Params[3]; /**< GL_EXT_point_parameters */ - GLfloat MinSize, MaxSize; /**< GL_EXT_point_parameters */ - GLfloat Threshold; /**< GL_EXT_point_parameters */ - GLboolean _Attenuated; /**< True if Params != [1, 0, 0] */ - GLboolean PointSprite; /**< GL_NV/ARB_point_sprite */ - GLboolean CoordReplace[MAX_TEXTURE_COORD_UNITS]; /**< GL_ARB_point_sprite*/ - GLenum SpriteRMode; /**< GL_NV_point_sprite (only!) */ - GLenum SpriteOrigin; /**< GL_ARB_point_sprite */ -}; - - -/** - * Polygon attribute group (GL_POLYGON_BIT). - */ -struct gl_polygon_attrib -{ - GLenum FrontFace; /**< Either GL_CW or GL_CCW */ - GLenum FrontMode; /**< Either GL_POINT, GL_LINE or GL_FILL */ - GLenum BackMode; /**< Either GL_POINT, GL_LINE or GL_FILL */ - GLboolean _FrontBit; /**< 0=GL_CCW, 1=GL_CW */ - GLboolean CullFlag; /**< Culling on/off flag */ - GLboolean SmoothFlag; /**< True if GL_POLYGON_SMOOTH is enabled */ - GLboolean StippleFlag; /**< True if GL_POLYGON_STIPPLE is enabled */ - GLenum CullFaceMode; /**< Culling mode GL_FRONT or GL_BACK */ - GLfloat OffsetFactor; /**< Polygon offset factor, from user */ - GLfloat OffsetUnits; /**< Polygon offset units, from user */ - GLboolean OffsetPoint; /**< Offset in GL_POINT mode */ - GLboolean OffsetLine; /**< Offset in GL_LINE mode */ - GLboolean OffsetFill; /**< Offset in GL_FILL mode */ -}; - - -/** - * Scissor attributes (GL_SCISSOR_BIT). - */ -struct gl_scissor_attrib -{ - GLboolean Enabled; /**< Scissor test enabled? */ - GLint X, Y; /**< Lower left corner of box */ - GLsizei Width, Height; /**< Size of box */ -}; - - -/** - * Stencil attribute group (GL_STENCIL_BUFFER_BIT). - * - * Three sets of stencil data are tracked so that OpenGL 2.0, - * GL_EXT_stencil_two_side, and GL_ATI_separate_stencil can all be supported - * simultaneously. In each of the stencil state arrays, element 0 corresponds - * to GL_FRONT. Element 1 corresponds to the OpenGL 2.0 / - * GL_ATI_separate_stencil GL_BACK state. Element 2 corresponds to the - * GL_EXT_stencil_two_side GL_BACK state. - * - * The derived value \c _BackFace is either 1 or 2 depending on whether or - * not GL_STENCIL_TEST_TWO_SIDE_EXT is enabled. - * - * The derived value \c _TestTwoSide is set when the front-face and back-face - * stencil state are different. - */ -struct gl_stencil_attrib -{ - GLboolean Enabled; /**< Enabled flag */ - GLboolean TestTwoSide; /**< GL_EXT_stencil_two_side */ - GLubyte ActiveFace; /**< GL_EXT_stencil_two_side (0 or 2) */ - GLboolean _Enabled; /**< Enabled and stencil buffer present */ - GLboolean _TestTwoSide; - GLubyte _BackFace; /**< Current back stencil state (1 or 2) */ - GLenum Function[3]; /**< Stencil function */ - GLenum FailFunc[3]; /**< Fail function */ - GLenum ZPassFunc[3]; /**< Depth buffer pass function */ - GLenum ZFailFunc[3]; /**< Depth buffer fail function */ - GLint Ref[3]; /**< Reference value */ - GLuint ValueMask[3]; /**< Value mask */ - GLuint WriteMask[3]; /**< Write mask */ - GLuint Clear; /**< Clear value */ -}; - - -/** - * An index for each type of texture object. These correspond to the GL - * texture target enums, such as GL_TEXTURE_2D, GL_TEXTURE_CUBE_MAP, etc. - * Note: the order is from highest priority to lowest priority. 
- */ -typedef enum -{ - TEXTURE_2D_ARRAY_INDEX, - TEXTURE_1D_ARRAY_INDEX, - TEXTURE_CUBE_INDEX, - TEXTURE_3D_INDEX, - TEXTURE_RECT_INDEX, - TEXTURE_2D_INDEX, - TEXTURE_1D_INDEX, - NUM_TEXTURE_TARGETS -} gl_texture_index; - - -/** - * Bit flags for each type of texture object - * Used for Texture.Unit[]._ReallyEnabled flags. - */ -/*@{*/ -#define TEXTURE_2D_ARRAY_BIT (1 << TEXTURE_2D_ARRAY_INDEX) -#define TEXTURE_1D_ARRAY_BIT (1 << TEXTURE_1D_ARRAY_INDEX) -#define TEXTURE_CUBE_BIT (1 << TEXTURE_CUBE_INDEX) -#define TEXTURE_3D_BIT (1 << TEXTURE_3D_INDEX) -#define TEXTURE_RECT_BIT (1 << TEXTURE_RECT_INDEX) -#define TEXTURE_2D_BIT (1 << TEXTURE_2D_INDEX) -#define TEXTURE_1D_BIT (1 << TEXTURE_1D_INDEX) -/*@}*/ - - -/** - * TexGenEnabled flags. - */ -/*@{*/ -#define S_BIT 1 -#define T_BIT 2 -#define R_BIT 4 -#define Q_BIT 8 -#define STR_BITS (S_BIT | T_BIT | R_BIT) -/*@}*/ - - -/** - * Bit flag versions of the corresponding GL_ constants. - */ -/*@{*/ -#define TEXGEN_SPHERE_MAP 0x1 -#define TEXGEN_OBJ_LINEAR 0x2 -#define TEXGEN_EYE_LINEAR 0x4 -#define TEXGEN_REFLECTION_MAP_NV 0x8 -#define TEXGEN_NORMAL_MAP_NV 0x10 - -#define TEXGEN_NEED_NORMALS (TEXGEN_SPHERE_MAP | \ - TEXGEN_REFLECTION_MAP_NV | \ - TEXGEN_NORMAL_MAP_NV) -#define TEXGEN_NEED_EYE_COORD (TEXGEN_SPHERE_MAP | \ - TEXGEN_REFLECTION_MAP_NV | \ - TEXGEN_NORMAL_MAP_NV | \ - TEXGEN_EYE_LINEAR) -/*@}*/ - - - -/** Tex-gen enabled for texture unit? */ -#define ENABLE_TEXGEN(unit) (1 << (unit)) - -/** Non-identity texture matrix for texture unit? */ -#define ENABLE_TEXMAT(unit) (1 << (unit)) - - -/** - * Texel fetch function prototype. We use texel fetch functions to - * extract RGBA, color indexes and depth components out of 1D, 2D and 3D - * texture images. These functions help to isolate us from the gritty - * details of all the various texture image encodings. - * - * \param texImage texture image. - * \param col texel column. - * \param row texel row. - * \param img texel image level/layer. - * \param texelOut output texel (up to 4 GLchans) - */ -typedef void (*FetchTexelFuncC)( const struct gl_texture_image *texImage, - GLint col, GLint row, GLint img, - GLchan *texelOut ); - -/** - * As above, but returns floats. - * Used for depth component images and for upcoming signed/float - * texture images. - */ -typedef void (*FetchTexelFuncF)( const struct gl_texture_image *texImage, - GLint col, GLint row, GLint img, - GLfloat *texelOut ); - - -typedef void (*StoreTexelFunc)(struct gl_texture_image *texImage, - GLint col, GLint row, GLint img, - const void *texel); - - -/** - * Texture image state. Describes the dimensions of a texture image, - * the texel format and pointers to Texel Fetch functions. - */ -struct gl_texture_image -{ - GLint InternalFormat; /**< Internal format as given by the user */ - GLenum _BaseFormat; /**< Either GL_RGB, GL_RGBA, GL_ALPHA, - * GL_LUMINANCE, GL_LUMINANCE_ALPHA, - * GL_INTENSITY, GL_COLOR_INDEX, - * GL_DEPTH_COMPONENT or GL_DEPTH_STENCIL_EXT - * only. Used for choosing TexEnv arithmetic. 
- */ - GLuint TexFormat; /**< The actual format: MESA_FORMAT_x */ - - GLuint Border; /**< 0 or 1 */ - GLuint Width; /**< = 2^WidthLog2 + 2*Border */ - GLuint Height; /**< = 2^HeightLog2 + 2*Border */ - GLuint Depth; /**< = 2^DepthLog2 + 2*Border */ - GLuint Width2; /**< = Width - 2*Border */ - GLuint Height2; /**< = Height - 2*Border */ - GLuint Depth2; /**< = Depth - 2*Border */ - GLuint WidthLog2; /**< = log2(Width2) */ - GLuint HeightLog2; /**< = log2(Height2) */ - GLuint DepthLog2; /**< = log2(Depth2) */ - GLuint MaxLog2; /**< = MAX(WidthLog2, HeightLog2) */ - GLfloat WidthScale; /**< used for mipmap LOD computation */ - GLfloat HeightScale; /**< used for mipmap LOD computation */ - GLfloat DepthScale; /**< used for mipmap LOD computation */ - GLboolean IsClientData; /**< Data owned by client? */ - GLboolean _IsPowerOfTwo; /**< Are all dimensions powers of two? */ - - struct gl_texture_object *TexObject; /**< Pointer back to parent object */ - - FetchTexelFuncC FetchTexelc; /**< GLchan texel fetch function pointer */ - FetchTexelFuncF FetchTexelf; /**< Float texel fetch function pointer */ - - GLuint RowStride; /**< Padded width in units of texels */ - GLuint *ImageOffsets; /**< if 3D texture: array [Depth] of offsets to - each 2D slice in 'Data', in texels */ - GLvoid *Data; /**< Image data, accessed via FetchTexel() */ - - /** - * \name For device driver: - */ - /*@{*/ - void *DriverData; /**< Arbitrary device driver data */ - /*@}*/ -}; - - -/** - * Indexes for cube map faces. - */ -typedef enum -{ - FACE_POS_X = 0, - FACE_NEG_X = 1, - FACE_POS_Y = 2, - FACE_NEG_Y = 3, - FACE_POS_Z = 4, - FACE_NEG_Z = 5, - MAX_FACES = 6 -} gl_face_index; - - -/** - * Texture object state. Contains the array of mipmap images, border color, - * wrap modes, filter modes, shadow/texcompare state, and the per-texture - * color palette. - */ -struct gl_texture_object -{ - _glthread_Mutex Mutex; /**< for thread safety */ - GLint RefCount; /**< reference count */ - GLuint Name; /**< the user-visible texture object ID */ - GLenum Target; /**< GL_TEXTURE_1D, GL_TEXTURE_2D, etc. */ - GLfloat Priority; /**< in [0,1] */ - union { - GLfloat f[4]; - GLuint ui[4]; - GLint i[4]; - } BorderColor; /**< Interpreted according to texture format */ - GLenum WrapS; /**< S-axis texture image wrap mode */ - GLenum WrapT; /**< T-axis texture image wrap mode */ - GLenum WrapR; /**< R-axis texture image wrap mode */ - GLenum MinFilter; /**< minification filter */ - GLenum MagFilter; /**< magnification filter */ - GLfloat MinLod; /**< min lambda, OpenGL 1.2 */ - GLfloat MaxLod; /**< max lambda, OpenGL 1.2 */ - GLfloat LodBias; /**< OpenGL 1.4 */ - GLint BaseLevel; /**< min mipmap level, OpenGL 1.2 */ - GLint MaxLevel; /**< max mipmap level, OpenGL 1.2 */ - GLfloat MaxAnisotropy; /**< GL_EXT_texture_filter_anisotropic */ - GLenum CompareMode; /**< GL_ARB_shadow */ - GLenum CompareFunc; /**< GL_ARB_shadow */ - GLfloat CompareFailValue; /**< GL_ARB_shadow_ambient */ - GLenum DepthMode; /**< GL_ARB_depth_texture */ - GLint _MaxLevel; /**< actual max mipmap level (q in the spec) */ - GLfloat _MaxLambda; /**< = _MaxLevel - BaseLevel (q - b in spec) */ - GLint CropRect[4]; /**< GL_OES_draw_texture */ - GLenum Swizzle[4]; /**< GL_EXT_texture_swizzle */ - GLuint _Swizzle; /**< same as Swizzle, but SWIZZLE_* format */ - GLboolean GenerateMipmap; /**< GL_SGIS_generate_mipmap */ - GLboolean _Complete; /**< Is texture object complete? */ - GLboolean _RenderToTexture; /**< Any rendering to this texture? 
*/ - GLboolean Purgeable; /**< Is the buffer purgeable under memory pressure? */ - GLenum sRGBDecode; /**< GL_DECODE_EXT or GL_SKIP_DECODE_EXT */ - - /** Actual texture images, indexed by [cube face] and [mipmap level] */ - struct gl_texture_image *Image[MAX_FACES][MAX_TEXTURE_LEVELS]; - - /** GL_EXT_paletted_texture */ - struct gl_color_table Palette; - - /** - * \name For device driver. - * Note: instead of attaching driver data to this pointer, it's preferable - * to instead use this struct as a base class for your own texture object - * class. Driver->NewTextureObject() can be used to implement the - * allocation. - */ - void *DriverData; /**< Arbitrary device driver data */ -}; - - -/** Up to four combiner sources are possible with GL_NV_texture_env_combine4 */ -#define MAX_COMBINER_TERMS 4 - - -/** - * Texture combine environment state. - */ -struct gl_tex_env_combine_state -{ - GLenum ModeRGB; /**< GL_REPLACE, GL_DECAL, GL_ADD, etc. */ - GLenum ModeA; /**< GL_REPLACE, GL_DECAL, GL_ADD, etc. */ - /** Source terms: GL_PRIMARY_COLOR, GL_TEXTURE, etc */ - GLenum SourceRGB[MAX_COMBINER_TERMS]; - GLenum SourceA[MAX_COMBINER_TERMS]; - /** Source operands: GL_SRC_COLOR, GL_ONE_MINUS_SRC_COLOR, etc */ - GLenum OperandRGB[MAX_COMBINER_TERMS]; - GLenum OperandA[MAX_COMBINER_TERMS]; - GLuint ScaleShiftRGB; /**< 0, 1 or 2 */ - GLuint ScaleShiftA; /**< 0, 1 or 2 */ - GLuint _NumArgsRGB; /**< Number of inputs used for the RGB combiner */ - GLuint _NumArgsA; /**< Number of inputs used for the A combiner */ -}; - - -/** - * Texture coord generation state. - */ -struct gl_texgen -{ - GLenum Mode; /**< GL_EYE_LINEAR, GL_SPHERE_MAP, etc */ - GLbitfield _ModeBit; /**< TEXGEN_x bit corresponding to Mode */ - GLfloat ObjectPlane[4]; - GLfloat EyePlane[4]; -}; - - -/** - * Texture unit state. Contains enable flags, texture environment/function/ - * combiners, texgen state, pointers to current texture objects and - * post-filter color tables. - */ -struct gl_texture_unit -{ - GLbitfield Enabled; /**< bitmask of TEXTURE_*_BIT flags */ - GLbitfield _ReallyEnabled; /**< 0 or exactly one of TEXTURE_*_BIT flags */ - - GLenum EnvMode; /**< GL_MODULATE, GL_DECAL, GL_BLEND, etc. */ - GLfloat EnvColor[4]; - - struct gl_texgen GenS; - struct gl_texgen GenT; - struct gl_texgen GenR; - struct gl_texgen GenQ; - GLbitfield TexGenEnabled; /**< Bitwise-OR of [STRQ]_BIT values */ - GLbitfield _GenFlags; /**< Bitwise-OR of Gen[STRQ]._ModeBit */ - - GLfloat LodBias; /**< for biasing mipmap levels */ - GLenum BumpTarget; - GLfloat RotMatrix[4]; /* 2x2 matrix */ - - /** - * \name GL_EXT_texture_env_combine - */ - struct gl_tex_env_combine_state Combine; - - /** - * Derived state based on \c EnvMode and the \c BaseFormat of the - * currently enabled texture. - */ - struct gl_tex_env_combine_state _EnvMode; - - /** - * Currently enabled combiner state. This will point to either - * \c Combine or \c _EnvMode. - */ - struct gl_tex_env_combine_state *_CurrentCombine; - - /** Current texture object pointers */ - struct gl_texture_object *CurrentTex[NUM_TEXTURE_TARGETS]; - - /** Points to highest priority, complete and enabled texture object */ - struct gl_texture_object *_Current; - - /** GL_SGI_texture_color_table */ - /*@{*/ - struct gl_color_table ColorTable; - struct gl_color_table ProxyColorTable; - GLboolean ColorTableEnabled; - /*@}*/ -}; - - -/** - * Texture attribute group (GL_TEXTURE_BIT). 
- */ -struct gl_texture_attrib -{ - GLuint CurrentUnit; /**< GL_ACTIVE_TEXTURE */ - struct gl_texture_unit Unit[MAX_COMBINED_TEXTURE_IMAGE_UNITS]; - - struct gl_texture_object *ProxyTex[NUM_TEXTURE_TARGETS]; - - /** GL_ARB_seamless_cubemap */ - GLboolean CubeMapSeamless; - - /** GL_EXT_shared_texture_palette */ - GLboolean SharedPalette; - struct gl_color_table Palette; - - /** Texture units/samplers used by vertex or fragment texturing */ - GLbitfield _EnabledUnits; - - /** Texture coord units/sets used for fragment texturing */ - GLbitfield _EnabledCoordUnits; - - /** Texture coord units that have texgen enabled */ - GLbitfield _TexGenEnabled; - - /** Texture coord units that have non-identity matrices */ - GLbitfield _TexMatEnabled; - - /** Bitwise-OR of all Texture.Unit[i]._GenFlags */ - GLbitfield _GenFlags; -}; - - -/** - * Transformation attribute group (GL_TRANSFORM_BIT). - */ -struct gl_transform_attrib -{ - GLenum MatrixMode; /**< Matrix mode */ - GLfloat EyeUserPlane[MAX_CLIP_PLANES][4]; /**< User clip planes */ - GLfloat _ClipUserPlane[MAX_CLIP_PLANES][4]; /**< derived */ - GLbitfield ClipPlanesEnabled; /**< on/off bitmask */ - GLboolean Normalize; /**< Normalize all normals? */ - GLboolean RescaleNormals; /**< GL_EXT_rescale_normal */ - GLboolean RasterPositionUnclipped; /**< GL_IBM_rasterpos_clip */ - GLboolean DepthClamp; /**< GL_ARB_depth_clamp */ - - GLfloat CullEyePos[4]; - GLfloat CullObjPos[4]; -}; - - -/** - * Viewport attribute group (GL_VIEWPORT_BIT). - */ -struct gl_viewport_attrib -{ - GLint X, Y; /**< position */ - GLsizei Width, Height; /**< size */ - GLfloat Near, Far; /**< Depth buffer range */ - GLmatrix _WindowMap; /**< Mapping transformation as a matrix. */ -}; - - -/** - * GL_ARB_vertex/pixel_buffer_object buffer object - */ -struct gl_buffer_object -{ - _glthread_Mutex Mutex; - GLint RefCount; - GLuint Name; - GLenum Usage; /**< GL_STREAM_DRAW_ARB, GL_STREAM_READ_ARB, etc. */ - GLsizeiptrARB Size; /**< Size of buffer storage in bytes */ - GLubyte *Data; /**< Location of storage either in RAM or VRAM. */ - /** Fields describing a mapped buffer */ - /*@{*/ - GLbitfield AccessFlags; /**< Mask of GL_MAP_x_BIT flags */ - GLvoid *Pointer; /**< User-space address of mapping */ - GLintptr Offset; /**< Mapped offset */ - GLsizeiptr Length; /**< Mapped length */ - /*@}*/ - GLboolean Written; /**< Ever written to? (for debugging) */ - GLboolean Purgeable; /**< Is the buffer purgeable under memory pressure? */ -}; - - -/** - * Client pixel packing/unpacking attributes - */ -struct gl_pixelstore_attrib -{ - GLint Alignment; - GLint RowLength; - GLint SkipPixels; - GLint SkipRows; - GLint ImageHeight; - GLint SkipImages; - GLboolean SwapBytes; - GLboolean LsbFirst; - GLboolean ClientStorage; /**< GL_APPLE_client_storage */ - GLboolean Invert; /**< GL_MESA_pack_invert */ - struct gl_buffer_object *BufferObj; /**< GL_ARB_pixel_buffer_object */ -}; - - -/** - * Client vertex array attributes - */ -struct gl_client_array -{ - GLint Size; /**< components per element (1,2,3,4) */ - GLenum Type; /**< datatype: GL_FLOAT, GL_INT, etc */ - GLenum Format; /**< default: GL_RGBA, but may be GL_BGRA */ - GLsizei Stride; /**< user-specified stride */ - GLsizei StrideB; /**< actual stride in bytes */ - const GLubyte *Ptr; /**< Points to array data */ - GLboolean Enabled; /**< Enabled flag is a boolean */ - GLboolean Normalized; /**< GL_ARB_vertex_program */ - GLboolean Integer; /**< Integer-valued? 
*/ - GLuint InstanceDivisor; /**< GL_ARB_instanced_arrays */ - GLuint _ElementSize; /**< size of each element in bytes */ - - struct gl_buffer_object *BufferObj;/**< GL_ARB_vertex_buffer_object */ - GLuint _MaxElement; /**< max element index into array buffer + 1 */ -}; - - -/** - * Collection of vertex arrays. Defined by the GL_APPLE_vertex_array_object - * extension, but a nice encapsulation in any case. - */ -struct gl_array_object -{ - /** Name of the array object as received from glGenVertexArrayAPPLE. */ - GLuint Name; - - GLint RefCount; - _glthread_Mutex Mutex; - GLboolean VBOonly; /**< require all arrays to live in VBOs? */ - - /** Conventional vertex arrays */ - /*@{*/ - struct gl_client_array Vertex; - struct gl_client_array Weight; - struct gl_client_array Normal; - struct gl_client_array Color; - struct gl_client_array SecondaryColor; - struct gl_client_array FogCoord; - struct gl_client_array Index; - struct gl_client_array EdgeFlag; - struct gl_client_array TexCoord[MAX_TEXTURE_COORD_UNITS]; - struct gl_client_array PointSize; - /*@}*/ - - /** - * Generic arrays for vertex programs/shaders. - * For NV vertex programs, these attributes alias and take priority - * over the conventional attribs above. For ARB vertex programs and - * GLSL vertex shaders, these attributes are separate. - */ - struct gl_client_array VertexAttrib[MAX_VERTEX_GENERIC_ATTRIBS]; - - /** Mask of _NEW_ARRAY_* values indicating which arrays are enabled */ - GLbitfield _Enabled; - - /** - * Min of all enabled arrays' _MaxElement. When arrays reside inside VBOs - * we can determine the max legal (in bounds) glDrawElements array index. - */ - GLuint _MaxElement; -}; - - -/** - * Vertex array state - */ -struct gl_array_attrib -{ - /** Currently bound array object. See _mesa_BindVertexArrayAPPLE() */ - struct gl_array_object *ArrayObj; - - /** The default vertex array object */ - struct gl_array_object *DefaultArrayObj; - - /** Array objects (GL_ARB/APPLE_vertex_array_object) */ - struct _mesa_HashTable *Objects; - - GLint ActiveTexture; /**< Client Active Texture */ - GLuint LockFirst; /**< GL_EXT_compiled_vertex_array */ - GLuint LockCount; /**< GL_EXT_compiled_vertex_array */ - - /** GL 3.1 (slightly different from GL_NV_primitive_restart) */ - GLboolean PrimitiveRestart; - GLuint RestartIndex; - - GLbitfield NewState; /**< mask of _NEW_ARRAY_* values */ - - /* GL_ARB_vertex_buffer_object */ - struct gl_buffer_object *ArrayBufferObj; - struct gl_buffer_object *ElementArrayBufferObj; -}; - - -/** - * Feedback buffer state - */ -struct gl_feedback -{ - GLenum Type; - GLbitfield _Mask; /**< FB_* bits */ - GLfloat *Buffer; - GLuint BufferSize; - GLuint Count; -}; - - -/** - * Selection buffer state - */ -struct gl_selection -{ - GLuint *Buffer; /**< selection buffer */ - GLuint BufferSize; /**< size of the selection buffer */ - GLuint BufferCount; /**< number of values in the selection buffer */ - GLuint Hits; /**< number of records in the selection buffer */ - GLuint NameStackDepth; /**< name stack depth */ - GLuint NameStack[MAX_NAME_STACK_DEPTH]; /**< name stack */ - GLboolean HitFlag; /**< hit flag */ - GLfloat HitMinZ; /**< minimum hit depth */ - GLfloat HitMaxZ; /**< maximum hit depth */ -}; - - -/** - * 1-D Evaluator control points - */ -struct gl_1d_map -{ - GLuint Order; /**< Number of control points */ - GLfloat u1, u2, du; /**< u1, u2, 1.0/(u2-u1) */ - GLfloat *Points; /**< Points to contiguous control points */ -}; - - -/** - * 2-D Evaluator control points - */ -struct gl_2d_map -{ - GLuint Uorder; 
/**< Number of control points in U dimension */ - GLuint Vorder; /**< Number of control points in V dimension */ - GLfloat u1, u2, du; - GLfloat v1, v2, dv; - GLfloat *Points; /**< Points to contiguous control points */ -}; - - -/** - * All evaluator control point state - */ -struct gl_evaluators -{ - /** - * \name 1-D maps - */ - /*@{*/ - struct gl_1d_map Map1Vertex3; - struct gl_1d_map Map1Vertex4; - struct gl_1d_map Map1Index; - struct gl_1d_map Map1Color4; - struct gl_1d_map Map1Normal; - struct gl_1d_map Map1Texture1; - struct gl_1d_map Map1Texture2; - struct gl_1d_map Map1Texture3; - struct gl_1d_map Map1Texture4; - struct gl_1d_map Map1Attrib[16]; /**< GL_NV_vertex_program */ - /*@}*/ - - /** - * \name 2-D maps - */ - /*@{*/ - struct gl_2d_map Map2Vertex3; - struct gl_2d_map Map2Vertex4; - struct gl_2d_map Map2Index; - struct gl_2d_map Map2Color4; - struct gl_2d_map Map2Normal; - struct gl_2d_map Map2Texture1; - struct gl_2d_map Map2Texture2; - struct gl_2d_map Map2Texture3; - struct gl_2d_map Map2Texture4; - struct gl_2d_map Map2Attrib[16]; /**< GL_NV_vertex_program */ - /*@}*/ -}; - - -/** - * Names of the various vertex/fragment program register files, etc. - * - * NOTE: first four tokens must fit into 2 bits (see t_vb_arbprogram.c) - * All values should fit in a 4-bit field. - * - * NOTE: PROGRAM_ENV_PARAM, PROGRAM_STATE_VAR, PROGRAM_NAMED_PARAM, - * PROGRAM_CONSTANT, and PROGRAM_UNIFORM can all be considered to - * be "uniform" variables since they can only be set outside glBegin/End. - * They're also all stored in the same Parameters array. - */ -typedef enum -{ - PROGRAM_TEMPORARY, /**< machine->Temporary[] */ - PROGRAM_INPUT, /**< machine->Inputs[] */ - PROGRAM_OUTPUT, /**< machine->Outputs[] */ - PROGRAM_VARYING, /**< machine->Inputs[]/Outputs[] */ - PROGRAM_LOCAL_PARAM, /**< gl_program->LocalParams[] */ - PROGRAM_ENV_PARAM, /**< gl_program->Parameters[] */ - PROGRAM_STATE_VAR, /**< gl_program->Parameters[] */ - PROGRAM_NAMED_PARAM, /**< gl_program->Parameters[] */ - PROGRAM_CONSTANT, /**< gl_program->Parameters[] */ - PROGRAM_UNIFORM, /**< gl_program->Parameters[] */ - PROGRAM_WRITE_ONLY, /**< A dummy, write-only register */ - PROGRAM_ADDRESS, /**< machine->AddressReg */ - PROGRAM_SAMPLER, /**< for shader samplers, compile-time only */ - PROGRAM_SYSTEM_VALUE,/**< InstanceId, PrimitiveID, etc. */ - PROGRAM_UNDEFINED, /**< Invalid/TBD value */ - PROGRAM_FILE_MAX -} gl_register_file; - - -/** - * If the register file is PROGRAM_SYSTEM_VALUE, the register index will be - * one of these values. 
- */ -typedef enum -{ - SYSTEM_VALUE_FRONT_FACE, /**< Fragment shader only (not done yet) */ - SYSTEM_VALUE_INSTANCE_ID, /**< Vertex shader only */ - SYSTEM_VALUE_MAX /**< Number of values */ -} gl_system_value; - - -/** Vertex and fragment instructions */ -struct prog_instruction; -struct gl_program_parameter_list; -struct gl_uniform_list; - - -/** - * Base class for any kind of program object - */ -struct gl_program -{ - GLuint Id; - GLubyte *String; /**< Null-terminated program text */ - GLint RefCount; - GLenum Target; /**< GL_VERTEX/FRAGMENT_PROGRAM_ARB, GL_FRAGMENT_PROGRAM_NV */ - GLenum Format; /**< String encoding format */ - GLboolean Resident; - - struct prog_instruction *Instructions; - - GLbitfield InputsRead; /**< Bitmask of which input regs are read */ - GLbitfield64 OutputsWritten; /**< Bitmask of which output regs are written */ - GLbitfield SystemValuesRead; /**< Bitmask of SYSTEM_VALUE_x inputs used */ - GLbitfield InputFlags[MAX_PROGRAM_INPUTS]; /**< PROG_PARAM_BIT_x flags */ - GLbitfield OutputFlags[MAX_PROGRAM_OUTPUTS]; /**< PROG_PARAM_BIT_x flags */ - GLbitfield TexturesUsed[MAX_TEXTURE_UNITS]; /**< TEXTURE_x_BIT bitmask */ - GLbitfield SamplersUsed; /**< Bitfield of which samplers are used */ - GLbitfield ShadowSamplers; /**< Texture units used for shadow sampling. */ - - - /** Named parameters, constants, etc. from program text */ - struct gl_program_parameter_list *Parameters; - /** Numbered local parameters */ - GLfloat LocalParams[MAX_PROGRAM_LOCAL_PARAMS][4]; - - /** Vertex/fragment shader varying vars */ - struct gl_program_parameter_list *Varying; - /** Vertex program user-defined attributes */ - struct gl_program_parameter_list *Attributes; - - /** Map from sampler unit to texture unit (set by glUniform1i()) */ - GLubyte SamplerUnits[MAX_SAMPLERS]; - /** Which texture target is being sampled (TEXTURE_1D/2D/3D/etc_INDEX) */ - gl_texture_index SamplerTargets[MAX_SAMPLERS]; - - /** Bitmask of which register files are read/written with indirect - * addressing. Mask of (1 << PROGRAM_x) bits. - */ - GLbitfield IndirectRegisterFiles; - - /** Logical counts */ - /*@{*/ - GLuint NumInstructions; - GLuint NumTemporaries; - GLuint NumParameters; - GLuint NumAttributes; - GLuint NumAddressRegs; - GLuint NumAluInstructions; - GLuint NumTexInstructions; - GLuint NumTexIndirections; - /*@}*/ - /** Native, actual h/w counts */ - /*@{*/ - GLuint NumNativeInstructions; - GLuint NumNativeTemporaries; - GLuint NumNativeParameters; - GLuint NumNativeAttributes; - GLuint NumNativeAddressRegs; - GLuint NumNativeAluInstructions; - GLuint NumNativeTexInstructions; - GLuint NumNativeTexIndirections; - /*@}*/ -}; - - -/** Vertex program object */ -struct gl_vertex_program -{ - struct gl_program Base; /**< base class */ - GLboolean IsNVProgram; /**< is this a GL_NV_vertex_program program? 
*/ - GLboolean IsPositionInvariant; -}; - - -/** Geometry program object */ -struct gl_geometry_program -{ - struct gl_program Base; /**< base class */ - - GLint VerticesOut; - GLenum InputType; /**< GL_POINTS, GL_LINES, GL_LINES_ADJACENCY_ARB, - GL_TRIANGLES, or GL_TRIANGLES_ADJACENCY_ARB */ - GLenum OutputType; /**< GL_POINTS, GL_LINE_STRIP or GL_TRIANGLE_STRIP */ -}; - - -/** Fragment program object */ -struct gl_fragment_program -{ - struct gl_program Base; /**< base class */ - GLenum FogOption; - GLboolean UsesKill; /**< shader uses KIL instruction */ - GLboolean OriginUpperLeft; - GLboolean PixelCenterInteger; - enum gl_frag_depth_layout FragDepthLayout; -}; - - -/** - * State common to vertex and fragment programs. - */ -struct gl_program_state -{ - GLint ErrorPos; /* GL_PROGRAM_ERROR_POSITION_ARB/NV */ - const char *ErrorString; /* GL_PROGRAM_ERROR_STRING_ARB/NV */ -}; - - -/** - * Context state for vertex programs. - */ -struct gl_vertex_program_state -{ - GLboolean Enabled; /**< User-set GL_VERTEX_PROGRAM_ARB/NV flag */ - GLboolean _Enabled; /**< Enabled and _valid_ user program? */ - GLboolean PointSizeEnabled; /**< GL_VERTEX_PROGRAM_POINT_SIZE_ARB/NV */ - GLboolean TwoSideEnabled; /**< GL_VERTEX_PROGRAM_TWO_SIDE_ARB/NV */ - struct gl_vertex_program *Current; /**< User-bound vertex program */ - - /** Currently enabled and valid vertex program (including internal - * programs, user-defined vertex programs and GLSL vertex shaders). - * This is the program we must use when rendering. - */ - struct gl_vertex_program *_Current; - - GLfloat Parameters[MAX_PROGRAM_ENV_PARAMS][4]; /**< Env params */ - - /* For GL_NV_vertex_program only: */ - GLenum TrackMatrix[MAX_PROGRAM_ENV_PARAMS / 4]; - GLenum TrackMatrixTransform[MAX_PROGRAM_ENV_PARAMS / 4]; - - /** Should fixed-function T&L be implemented with a vertex prog? */ - GLboolean _MaintainTnlProgram; - - /** Program to emulate fixed-function T&L (see above) */ - struct gl_vertex_program *_TnlProgram; - - /** Cache of fixed-function programs */ - struct gl_program_cache *Cache; - - GLboolean _Overriden; -}; - - -/** - * Context state for geometry programs. - */ -struct gl_geometry_program_state -{ - GLboolean Enabled; /**< GL_ARB_GEOMETRY_SHADER4 */ - GLboolean _Enabled; /**< Enabled and valid program? */ - struct gl_geometry_program *Current; /**< user-bound geometry program */ - - /** Currently enabled and valid program (including internal programs - * and compiled shader programs). - */ - struct gl_geometry_program *_Current; - - GLfloat Parameters[MAX_PROGRAM_ENV_PARAMS][4]; /**< Env params */ - - /** Cache of fixed-function programs */ - struct gl_program_cache *Cache; -}; - -/** - * Context state for fragment programs. - */ -struct gl_fragment_program_state -{ - GLboolean Enabled; /**< User-set fragment program enable flag */ - GLboolean _Enabled; /**< Enabled and _valid_ user program? */ - struct gl_fragment_program *Current; /**< User-bound fragment program */ - - /** Currently enabled and valid fragment program (including internal - * programs, user-defined fragment programs and GLSL fragment shaders). - * This is the program we must use when rendering. - */ - struct gl_fragment_program *_Current; - - GLfloat Parameters[MAX_PROGRAM_ENV_PARAMS][4]; /**< Env params */ - - /** Should fixed-function texturing be implemented with a fragment prog? 
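/* Illustrative sketch of the Current/_Current distinction documented above:
 * _Current is the derived program actually used for rendering, which may be
 * the user-bound program or, when _MaintainTnlProgram is set, the generated
 * fixed-function replacement.  This is a simplified stand-alone model, not
 * Mesa's real state-validation code; the tiny struct below is a stand-in.
 */
#include <stdio.h>

struct vp_state {
   int Enabled;                /* user-set enable flag              */
   const char *Current;        /* user-bound vertex program (name)  */
   int _MaintainTnlProgram;    /* emulate fixed-function T&L?       */
   const char *_TnlProgram;    /* generated fixed-function program  */
};

/* Decide what _Current should point at. */
static const char *
choose_vertex_program(const struct vp_state *vp)
{
   if (vp->Enabled && vp->Current)
      return vp->Current;         /* valid user program wins          */
   if (vp->_MaintainTnlProgram)
      return vp->_TnlProgram;     /* fixed-function T&L replacement   */
   return "fixed-function";       /* no program-based path            */
}

int main(void)
{
   struct vp_state vp = { 1, "user ARB_vp", 1, "tnl program" };
   printf("%s\n", choose_vertex_program(&vp));   /* "user ARB_vp" */
   vp.Enabled = 0;
   printf("%s\n", choose_vertex_program(&vp));   /* "tnl program" */
   return 0;
}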
*/ - GLboolean _MaintainTexEnvProgram; - - /** Program to emulate fixed-function texture env/combine (see above) */ - struct gl_fragment_program *_TexEnvProgram; - - /** Cache of fixed-function programs */ - struct gl_program_cache *Cache; -}; - - -/** - * ATI_fragment_shader runtime state - */ -#define ATI_FS_INPUT_PRIMARY 0 -#define ATI_FS_INPUT_SECONDARY 1 - -struct atifs_instruction; -struct atifs_setupinst; - -/** - * ATI fragment shader - */ -struct ati_fragment_shader -{ - GLuint Id; - GLint RefCount; - struct atifs_instruction *Instructions[2]; - struct atifs_setupinst *SetupInst[2]; - GLfloat Constants[8][4]; - GLbitfield LocalConstDef; /**< Indicates which constants have been set */ - GLubyte numArithInstr[2]; - GLubyte regsAssigned[2]; - GLubyte NumPasses; /**< 1 or 2 */ - GLubyte cur_pass; - GLubyte last_optype; - GLboolean interpinp1; - GLboolean isValid; - GLuint swizzlerq; -}; - -/** - * Context state for GL_ATI_fragment_shader - */ -struct gl_ati_fragment_shader_state -{ - GLboolean Enabled; - GLboolean _Enabled; /**< enabled and valid shader? */ - GLboolean Compiling; - GLfloat GlobalConstants[8][4]; - struct ati_fragment_shader *Current; -}; - - -/** - * Occlusion/timer query object. - */ -struct gl_query_object -{ - GLenum Target; /**< The query target, when active */ - GLuint Id; /**< hash table ID/name */ - GLuint64EXT Result; /**< the counter */ - GLboolean Active; /**< inside Begin/EndQuery */ - GLboolean Ready; /**< result is ready? */ -}; - - -/** - * Context state for query objects. - */ -struct gl_query_state -{ - struct _mesa_HashTable *QueryObjects; - struct gl_query_object *CurrentOcclusionObject; /* GL_ARB_occlusion_query */ - struct gl_query_object *CurrentTimerObject; /* GL_EXT_timer_query */ - - /** GL_NV_conditional_render */ - struct gl_query_object *CondRenderQuery; - - /** GL_EXT_transform_feedback */ - struct gl_query_object *PrimitivesGenerated; - struct gl_query_object *PrimitivesWritten; - - /** GL_ARB_timer_query */ - struct gl_query_object *TimeElapsed; - - GLenum CondRenderMode; -}; - - -/** Sync object state */ -struct gl_sync_object { - struct simple_node link; - GLenum Type; /**< GL_SYNC_FENCE */ - GLuint Name; /**< Fence name */ - GLint RefCount; /**< Reference count */ - GLboolean DeletePending; /**< Object was deleted while there were still - * live references (e.g., sync not yet finished) - */ - GLenum SyncCondition; - GLbitfield Flags; /**< Flags passed to glFenceSync */ - GLuint StatusFlag:1; /**< Has the sync object been signaled? */ -}; - - -/** Set by #pragma directives */ -struct gl_sl_pragmas -{ - GLboolean IgnoreOptimize; /**< ignore #pragma optimize(on/off) ? */ - GLboolean IgnoreDebug; /**< ignore #pragma debug(on/off) ? */ - GLboolean Optimize; /**< defaults on */ - GLboolean Debug; /**< defaults off */ -}; - - -/** - * A GLSL vertex or fragment shader object. - */ -struct gl_shader -{ - GLenum Type; /**< GL_FRAGMENT_SHADER || GL_VERTEX_SHADER || GL_GEOMETRY_SHADER_ARB (first field!) 
*/ - GLuint Name; /**< AKA the handle */ - GLint RefCount; /**< Reference count */ - GLboolean DeletePending; - GLboolean CompileStatus; - const GLchar *Source; /**< Source code string */ - GLuint SourceChecksum; /**< for debug/logging purposes */ - struct gl_program *Program; /**< Post-compile assembly code */ - GLchar *InfoLog; - struct gl_sl_pragmas Pragmas; - - unsigned Version; /**< GLSL version used for linking */ - - struct exec_list *ir; - struct glsl_symbol_table *symbols; - - /** Shaders containing built-in functions that are used for linking. */ - struct gl_shader *builtins_to_link[16]; - unsigned num_builtins_to_link; -}; - - -/** - * A GLSL program object. - * Basically a linked collection of vertex and fragment shaders. - */ -struct gl_shader_program -{ - GLenum Type; /**< Always GL_SHADER_PROGRAM (internal token) */ - GLuint Name; /**< aka handle or ID */ - GLint RefCount; /**< Reference count */ - GLboolean DeletePending; - - GLuint NumShaders; /**< number of attached shaders */ - struct gl_shader **Shaders; /**< List of attached the shaders */ - - /** User-defined attribute bindings (glBindAttribLocation) */ - struct gl_program_parameter_list *Attributes; - - /** Transform feedback varyings */ - struct { - GLenum BufferMode; - GLuint NumVarying; - GLchar **VaryingNames; /**< Array [NumVarying] of char * */ - } TransformFeedback; - - /** Geometry shader state - copied into gl_geometry_program at link time */ - struct { - GLint VerticesOut; - GLenum InputType; /**< GL_POINTS, GL_LINES, GL_LINES_ADJACENCY_ARB, - GL_TRIANGLES, or GL_TRIANGLES_ADJACENCY_ARB */ - GLenum OutputType; /**< GL_POINTS, GL_LINE_STRIP or GL_TRIANGLE_STRIP */ - } Geom; - - /* post-link info: */ - struct gl_vertex_program *VertexProgram; /**< Linked vertex program */ - struct gl_fragment_program *FragmentProgram; /**< Linked fragment prog */ - struct gl_geometry_program *GeometryProgram; /**< Linked geometry prog */ - struct gl_uniform_list *Uniforms; - struct gl_program_parameter_list *Varying; - GLboolean LinkStatus; /**< GL_LINK_STATUS */ - GLboolean Validated; - GLboolean _Used; /**< Ever used for drawing? */ - GLchar *InfoLog; - - unsigned Version; /**< GLSL version used for linking */ - - /** - * Per-stage shaders resulting from the first stage of linking. - * - * Set of linked shaders for this program. The array is accessed using the - * \c MESA_SHADER_* defines. Entries for non-existent stages will be - * \c NULL. - */ - struct gl_shader *_LinkedShaders[MESA_SHADER_TYPES]; -}; - - -#define GLSL_DUMP 0x1 /**< Dump shaders to stdout */ -#define GLSL_LOG 0x2 /**< Write shaders to files */ -#define GLSL_OPT 0x4 /**< Force optimizations (override pragmas) */ -#define GLSL_NO_OPT 0x8 /**< Force no optimizations (override pragmas) */ -#define GLSL_UNIFORMS 0x10 /**< Print glUniform calls */ -#define GLSL_NOP_VERT 0x20 /**< Force no-op vertex shaders */ -#define GLSL_NOP_FRAG 0x40 /**< Force no-op fragment shaders */ -#define GLSL_USE_PROG 0x80 /**< Log glUseProgram calls */ - - -/** - * Context state for GLSL vertex/fragment shaders. - */ -struct gl_shader_state -{ - /** - * Programs used for rendering - * - * There is a separate program set for each shader stage. If - * GL_EXT_separate_shader_objects is not supported, each of these must point - * to \c NULL or to the same program. - */ - struct gl_shader_program *CurrentVertexProgram; - struct gl_shader_program *CurrentGeometryProgram; - struct gl_shader_program *CurrentFragmentProgram; - - /** - * Program used by glUniform calls. 
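/* The GLSL_x defines above form a debug bitmask kept in
 * gl_shader_state::Flags.  Stand-alone sketch of testing it; the flag values
 * are copied from the defines above, and the local "flags" variable stands in
 * for ctx->Shader.Flags.
 */
#include <stdio.h>

#define GLSL_DUMP      0x1   /* Dump shaders to stdout  */
#define GLSL_LOG       0x2   /* Write shaders to files  */
#define GLSL_UNIFORMS  0x10  /* Print glUniform calls   */
#define GLSL_USE_PROG  0x80  /* Log glUseProgram calls  */

static void
log_use_program(unsigned flags, unsigned prog_name)
{
   if (flags & GLSL_USE_PROG)
      printf("glUseProgram(%u)\n", prog_name);
}

int main(void)
{
   unsigned flags = GLSL_DUMP | GLSL_USE_PROG;
   log_use_program(flags, 7);          /* prints "glUseProgram(7)" */
   log_use_program(GLSL_LOG, 7);       /* prints nothing           */
   return 0;
}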
- * - * Explicitly set by \c glUseProgram and \c glActiveProgramEXT. - */ - struct gl_shader_program *ActiveProgram; - - void *MemPool; - - GLbitfield Flags; /**< Mask of GLSL_x flags */ -}; - -/** - * Compiler options for a single GLSL shaders type - */ -struct gl_shader_compiler_options -{ - /** Driver-selectable options: */ - GLboolean EmitCondCodes; /**< Use condition codes? */ - GLboolean EmitNVTempInitialization; /**< 0-fill NV temp registers */ - /** - * Attempts to flatten all ir_if (OPCODE_IF) for GPUs that can't - * support control flow. - */ - GLboolean EmitNoIfs; - GLboolean EmitNoLoops; - GLboolean EmitNoFunctions; - GLboolean EmitNoCont; /**< Emit CONT opcode? */ - GLboolean EmitNoMainReturn; /**< Emit CONT/RET opcodes? */ - GLboolean EmitNoNoise; /**< Emit NOISE opcodes? */ - GLboolean EmitNoPow; /**< Emit POW opcodes? */ - - /** - * \name Forms of indirect addressing the driver cannot do. - */ - /*@{*/ - GLboolean EmitNoIndirectInput; /**< No indirect addressing of inputs */ - GLboolean EmitNoIndirectOutput; /**< No indirect addressing of outputs */ - GLboolean EmitNoIndirectTemp; /**< No indirect addressing of temps */ - GLboolean EmitNoIndirectUniform; /**< No indirect addressing of constants */ - /*@}*/ - - GLuint MaxUnrollIterations; - - struct gl_sl_pragmas DefaultPragmas; /**< Default #pragma settings */ -}; - -/** - * Transform feedback object state - */ -struct gl_transform_feedback_object -{ - GLuint Name; /**< AKA the object ID */ - GLint RefCount; - GLboolean Active; /**< Is transform feedback enabled? */ - GLboolean Paused; /**< Is transform feedback paused? */ - - /** The feedback buffers */ - GLuint BufferNames[MAX_FEEDBACK_ATTRIBS]; - struct gl_buffer_object *Buffers[MAX_FEEDBACK_ATTRIBS]; - - /** Start of feedback data in dest buffer */ - GLintptr Offset[MAX_FEEDBACK_ATTRIBS]; - /** Max data to put into dest buffer (in bytes) */ - GLsizeiptr Size[MAX_FEEDBACK_ATTRIBS]; -}; - - -/** - * Context state for transform feedback. - */ -struct gl_transform_feedback -{ - GLenum Mode; /**< GL_POINTS, GL_LINES or GL_TRIANGLES */ - - GLboolean RasterDiscard; /**< GL_RASTERIZER_DISCARD */ - - /** The general binding point (GL_TRANSFORM_FEEDBACK_BUFFER) */ - struct gl_buffer_object *CurrentBuffer; - - /** The table of all transform feedback objects */ - struct _mesa_HashTable *Objects; - - /** The current xform-fb object (GL_TRANSFORM_FEEDBACK_BINDING) */ - struct gl_transform_feedback_object *CurrentObject; - - /** The default xform-fb object (Name==0) */ - struct gl_transform_feedback_object *DefaultObject; -}; - - - -/** - * State which can be shared by multiple contexts: - */ -struct gl_shared_state -{ - _glthread_Mutex Mutex; /**< for thread safety */ - GLint RefCount; /**< Reference count */ - struct _mesa_HashTable *DisplayList; /**< Display lists hash table */ - struct _mesa_HashTable *TexObjects; /**< Texture objects hash table */ - - /** Default texture objects (shared by all texture units) */ - struct gl_texture_object *DefaultTex[NUM_TEXTURE_TARGETS]; - - /** Fallback texture used when a bound texture is incomplete */ - struct gl_texture_object *FallbackTex; - - /** - * \name Thread safety and statechange notification for texture - * objects. - * - * \todo Improve the granularity of locking. 
- */ - /*@{*/ - _glthread_Mutex TexMutex; /**< texobj thread safety */ - GLuint TextureStateStamp; /**< state notification for shared tex */ - /*@}*/ - - /** Default buffer object for vertex arrays that aren't in VBOs */ - struct gl_buffer_object *NullBufferObj; - - /** - * \name Vertex/geometry/fragment programs - */ - /*@{*/ - struct _mesa_HashTable *Programs; /**< All vertex/fragment programs */ - struct gl_vertex_program *DefaultVertexProgram; - struct gl_fragment_program *DefaultFragmentProgram; - struct gl_geometry_program *DefaultGeometryProgram; - /*@}*/ - - /* GL_ATI_fragment_shader */ - struct _mesa_HashTable *ATIShaders; - struct ati_fragment_shader *DefaultFragmentShader; - - struct _mesa_HashTable *BufferObjects; - - /** Table of both gl_shader and gl_shader_program objects */ - struct _mesa_HashTable *ShaderObjects; - - /* GL_EXT_framebuffer_object */ - struct _mesa_HashTable *RenderBuffers; - struct _mesa_HashTable *FrameBuffers; - - /* GL_ARB_sync */ - struct simple_node SyncObjects; - - void *DriverData; /**< Device driver shared state */ -}; - - - - -/** - * A renderbuffer stores colors or depth values or stencil values. - * A framebuffer object will have a collection of these. - * Data are read/written to the buffer with a handful of Get/Put functions. - * - * Instances of this object are allocated with the Driver's NewRenderbuffer - * hook. Drivers will likely wrap this class inside a driver-specific - * class to simulate inheritance. - */ -struct gl_renderbuffer -{ -#define RB_MAGIC 0xaabbccdd - int Magic; /** XXX TEMPORARY DEBUG INFO */ - _glthread_Mutex Mutex; /**< for thread safety */ - GLuint ClassID; /**< Useful for drivers */ - GLuint Name; - GLint RefCount; - GLuint Width, Height; - GLboolean Purgeable; /**< Is the buffer purgeable under memory pressure? */ - - GLenum InternalFormat; /**< The user-specified format */ - GLenum _BaseFormat; /**< Either GL_RGB, GL_RGBA, GL_DEPTH_COMPONENT or - GL_STENCIL_INDEX. */ - GLuint Format; /**< The actual format: MESA_FORMAT_x */ - - GLubyte NumSamples; - - GLenum DataType; /**< Type of values passed to the Get/Put functions */ - GLvoid *Data; /**< This may not be used by some kinds of RBs */ - - /* Used to wrap one renderbuffer around another: */ - struct gl_renderbuffer *Wrapped; - - /* Delete this renderbuffer */ - void (*Delete)(struct gl_renderbuffer *rb); - - /* Allocate new storage for this renderbuffer */ - GLboolean (*AllocStorage)(struct gl_context *ctx, struct gl_renderbuffer *rb, - GLenum internalFormat, - GLuint width, GLuint height); - - /* Lock/Unlock are called before/after calling the Get/Put functions. - * Not sure this is the right place for these yet. - void (*Lock)(struct gl_context *ctx, struct gl_renderbuffer *rb); - void (*Unlock)(struct gl_context *ctx, struct gl_renderbuffer *rb); - */ - - /* Return a pointer to the element/pixel at (x,y). - * Should return NULL if the buffer memory can't be directly addressed. - */ - void *(*GetPointer)(struct gl_context *ctx, struct gl_renderbuffer *rb, - GLint x, GLint y); - - /* Get/Read a row of values. - * The values will be of format _BaseFormat and type DataType. - */ - void (*GetRow)(struct gl_context *ctx, struct gl_renderbuffer *rb, GLuint count, - GLint x, GLint y, void *values); - - /* Get/Read values at arbitrary locations. - * The values will be of format _BaseFormat and type DataType. 
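/* Sketch of the span-based access pattern the gl_renderbuffer hooks above
 * describe: core code always goes through rb->GetRow / rb->PutRow, and each
 * renderbuffer implementation supplies its own storage behind them.  The
 * stand-in below uses a plain malloc'ed RGBA8 buffer; the names mirror the
 * hooks but this is not the driver-facing API itself.
 */
#include <stdlib.h>
#include <string.h>

struct sw_rb {
   unsigned Width, Height;
   unsigned char *Data;                       /* Width * Height * 4 bytes */
   void (*GetRow)(struct sw_rb *rb, unsigned count,
                  int x, int y, void *values);
};

static void
sw_get_row(struct sw_rb *rb, unsigned count, int x, int y, void *values)
{
   memcpy(values, rb->Data + (y * rb->Width + x) * 4, count * 4);
}

int main(void)
{
   struct sw_rb rb = { 4, 4, calloc(1, 4 * 4 * 4), sw_get_row };
   unsigned char row[4 * 4];

   if (!rb.Data)
      return 1;
   rb.GetRow(&rb, 4, 0, 1, row);              /* read one 4-pixel span */
   free(rb.Data);
   return 0;
}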
- */ - void (*GetValues)(struct gl_context *ctx, struct gl_renderbuffer *rb, GLuint count, - const GLint x[], const GLint y[], void *values); - - /* Put/Write a row of values. - * The values will be of format _BaseFormat and type DataType. - */ - void (*PutRow)(struct gl_context *ctx, struct gl_renderbuffer *rb, GLuint count, - GLint x, GLint y, const void *values, const GLubyte *mask); - - /* Put/Write a row of RGB values. This is a special-case routine that's - * only used for RGBA renderbuffers when the source data is GL_RGB. That's - * a common case for glDrawPixels and some triangle routines. - * The values will be of format GL_RGB and type DataType. - */ - void (*PutRowRGB)(struct gl_context *ctx, struct gl_renderbuffer *rb, GLuint count, - GLint x, GLint y, const void *values, const GLubyte *mask); - - - /* Put/Write a row of identical values. - * The values will be of format _BaseFormat and type DataType. - */ - void (*PutMonoRow)(struct gl_context *ctx, struct gl_renderbuffer *rb, GLuint count, - GLint x, GLint y, const void *value, const GLubyte *mask); - - /* Put/Write values at arbitrary locations. - * The values will be of format _BaseFormat and type DataType. - */ - void (*PutValues)(struct gl_context *ctx, struct gl_renderbuffer *rb, GLuint count, - const GLint x[], const GLint y[], const void *values, - const GLubyte *mask); - /* Put/Write identical values at arbitrary locations. - * The values will be of format _BaseFormat and type DataType. - */ - void (*PutMonoValues)(struct gl_context *ctx, struct gl_renderbuffer *rb, - GLuint count, const GLint x[], const GLint y[], - const void *value, const GLubyte *mask); -}; - - -/** - * A renderbuffer attachment points to either a texture object (and specifies - * a mipmap level, cube face or 3D texture slice) or points to a renderbuffer. - */ -struct gl_renderbuffer_attachment -{ - GLenum Type; /**< \c GL_NONE or \c GL_TEXTURE or \c GL_RENDERBUFFER_EXT */ - GLboolean Complete; - - /** - * If \c Type is \c GL_RENDERBUFFER_EXT, this stores a pointer to the - * application supplied renderbuffer object. - */ - struct gl_renderbuffer *Renderbuffer; - - /** - * If \c Type is \c GL_TEXTURE, this stores a pointer to the application - * supplied texture object. - */ - struct gl_texture_object *Texture; - GLuint TextureLevel; /**< Attached mipmap level. */ - GLuint CubeMapFace; /**< 0 .. 5, for cube map textures. */ - GLuint Zoffset; /**< Slice for 3D textures, or layer for both 1D - * and 2D array textures */ -}; - - -/** - * A framebuffer is a collection of renderbuffers (color, depth, stencil, etc). - * In C++ terms, think of this as a base class from which device drivers - * will make derived classes. - */ -struct gl_framebuffer -{ - _glthread_Mutex Mutex; /**< for thread safety */ - /** - * If zero, this is a window system framebuffer. If non-zero, this - * is a FBO framebuffer; note that for some devices (i.e. those with - * a natural pixel coordinate system for FBOs that differs from the - * OpenGL/Mesa coordinate system), this means that the viewport, - * polygon face orientation, and polygon stipple will have to be inverted. - */ - GLuint Name; - - GLint RefCount; - GLboolean DeletePending; - - /** - * The framebuffer's visual. Immutable if this is a window system buffer. - * Computed from attachments if user-made FBO. 
- */ - struct gl_config Visual; - - GLboolean Initialized; - - GLuint Width, Height; /**< size of frame buffer in pixels */ - - /** \name Drawing bounds (Intersection of buffer size and scissor box) */ - /*@{*/ - GLint _Xmin, _Xmax; /**< inclusive */ - GLint _Ymin, _Ymax; /**< exclusive */ - /*@}*/ - - /** \name Derived Z buffer stuff */ - /*@{*/ - GLuint _DepthMax; /**< Max depth buffer value */ - GLfloat _DepthMaxF; /**< Float max depth buffer value */ - GLfloat _MRD; /**< minimum resolvable difference in Z values */ - /*@}*/ - - /** One of the GL_FRAMEBUFFER_(IN)COMPLETE_* tokens */ - GLenum _Status; - - /** Integer color values */ - GLboolean _IntegerColor; - - /** Array of all renderbuffer attachments, indexed by BUFFER_* tokens. */ - struct gl_renderbuffer_attachment Attachment[BUFFER_COUNT]; - - /* In unextended OpenGL these vars are part of the GL_COLOR_BUFFER - * attribute group and GL_PIXEL attribute group, respectively. - */ - GLenum ColorDrawBuffer[MAX_DRAW_BUFFERS]; - GLenum ColorReadBuffer; - - /** Computed from ColorDraw/ReadBuffer above */ - GLuint _NumColorDrawBuffers; - GLint _ColorDrawBufferIndexes[MAX_DRAW_BUFFERS]; /**< BUFFER_x or -1 */ - GLint _ColorReadBufferIndex; /* -1 = None */ - struct gl_renderbuffer *_ColorDrawBuffers[MAX_DRAW_BUFFERS]; - struct gl_renderbuffer *_ColorReadBuffer; - - /** The Actual depth/stencil buffers to use. May be wrappers around the - * depth/stencil buffers attached above. */ - struct gl_renderbuffer *_DepthBuffer; - struct gl_renderbuffer *_StencilBuffer; - - /** Delete this framebuffer */ - void (*Delete)(struct gl_framebuffer *fb); -}; - - -/** - * Precision info for shader datatypes. See glGetShaderPrecisionFormat(). - */ -struct gl_precision -{ - GLushort RangeMin; /**< min value exponent */ - GLushort RangeMax; /**< max value exponent */ - GLushort Precision; /**< number of mantissa bits */ -}; - - -/** - * Limits for vertex and fragment programs/shaders. - */ -struct gl_program_constants -{ - /* logical limits */ - GLuint MaxInstructions; - GLuint MaxAluInstructions; - GLuint MaxTexInstructions; - GLuint MaxTexIndirections; - GLuint MaxAttribs; - GLuint MaxTemps; - GLuint MaxAddressRegs; - GLuint MaxParameters; - GLuint MaxLocalParams; - GLuint MaxEnvParams; - /* native/hardware limits */ - GLuint MaxNativeInstructions; - GLuint MaxNativeAluInstructions; - GLuint MaxNativeTexInstructions; - GLuint MaxNativeTexIndirections; - GLuint MaxNativeAttribs; - GLuint MaxNativeTemps; - GLuint MaxNativeAddressRegs; - GLuint MaxNativeParameters; - /* For shaders */ - GLuint MaxUniformComponents; - /* GL_ARB_geometry_shader4 */ - GLuint MaxGeometryTextureImageUnits; - GLuint MaxGeometryVaryingComponents; - GLuint MaxVertexVaryingComponents; - GLuint MaxGeometryUniformComponents; - GLuint MaxGeometryOutputVertices; - GLuint MaxGeometryTotalOutputComponents; - /* ES 2.0 and GL_ARB_ES2_compatibility */ - struct gl_precision LowFloat, MediumFloat, HighFloat; - struct gl_precision LowInt, MediumInt, HighInt; -}; - - -/** - * Constants which may be overridden by device driver during context creation - * but are never changed after that. - */ -struct gl_constants -{ - GLint MaxTextureMbytes; /**< Max memory per image, in MB */ - GLint MaxTextureLevels; /**< Max mipmap levels. 
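/* The _Xmin/_Xmax/_Ymin/_Ymax fields above are documented as the intersection
 * of the buffer size and the scissor box.  A stand-alone sketch of that
 * derivation, clamped so the result is never negative; the scissor parameters
 * here are hypothetical inputs, not the real gl_scissor_attrib.
 */
#include <stdio.h>

static void
compute_draw_bounds(int buf_w, int buf_h,
                    int scissor_enabled, int sx, int sy, int sw, int sh,
                    int *xmin, int *xmax, int *ymin, int *ymax)
{
   *xmin = 0;      *ymin = 0;
   *xmax = buf_w;  *ymax = buf_h;
   if (scissor_enabled) {
      if (sx > *xmin)        *xmin = sx;
      if (sy > *ymin)        *ymin = sy;
      if (sx + sw < *xmax)   *xmax = sx + sw;
      if (sy + sh < *ymax)   *ymax = sy + sh;
   }
   if (*xmax < *xmin) *xmax = *xmin;   /* empty intersection */
   if (*ymax < *ymin) *ymax = *ymin;
}

int main(void)
{
   int x0, x1, y0, y1;
   compute_draw_bounds(640, 480, 1, 100, 50, 200, 300, &x0, &x1, &y0, &y1);
   printf("%d..%d x %d..%d\n", x0, x1, y0, y1);   /* 100..300 x 50..350 */
   return 0;
}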
*/ - GLint Max3DTextureLevels; /**< Max mipmap levels for 3D textures */ - GLint MaxCubeTextureLevels; /**< Max mipmap levels for cube textures */ - GLint MaxArrayTextureLayers; /**< Max layers in array textures */ - GLint MaxTextureRectSize; /**< Max rectangle texture size, in pixes */ - GLuint MaxTextureCoordUnits; - GLuint MaxTextureImageUnits; - GLuint MaxVertexTextureImageUnits; - GLuint MaxCombinedTextureImageUnits; - GLuint MaxTextureUnits; /**< = MIN(CoordUnits, ImageUnits) */ - GLfloat MaxTextureMaxAnisotropy; /**< GL_EXT_texture_filter_anisotropic */ - GLfloat MaxTextureLodBias; /**< GL_EXT_texture_lod_bias */ - - GLuint MaxArrayLockSize; - - GLint SubPixelBits; - - GLfloat MinPointSize, MaxPointSize; /**< aliased */ - GLfloat MinPointSizeAA, MaxPointSizeAA; /**< antialiased */ - GLfloat PointSizeGranularity; - GLfloat MinLineWidth, MaxLineWidth; /**< aliased */ - GLfloat MinLineWidthAA, MaxLineWidthAA; /**< antialiased */ - GLfloat LineWidthGranularity; - - GLuint MaxColorTableSize; - - GLuint MaxClipPlanes; - GLuint MaxLights; - GLfloat MaxShininess; /**< GL_NV_light_max_exponent */ - GLfloat MaxSpotExponent; /**< GL_NV_light_max_exponent */ - - GLuint MaxViewportWidth, MaxViewportHeight; - - struct gl_program_constants VertexProgram; /**< GL_ARB_vertex_program */ - struct gl_program_constants FragmentProgram; /**< GL_ARB_fragment_program */ - struct gl_program_constants GeometryProgram; /**< GL_ARB_geometry_shader4 */ - GLuint MaxProgramMatrices; - GLuint MaxProgramMatrixStackDepth; - - /** vertex array / buffer object bounds checking */ - GLboolean CheckArrayBounds; - - GLuint MaxDrawBuffers; /**< GL_ARB_draw_buffers */ - - GLuint MaxColorAttachments; /**< GL_EXT_framebuffer_object */ - GLuint MaxRenderbufferSize; /**< GL_EXT_framebuffer_object */ - GLuint MaxSamples; /**< GL_ARB_framebuffer_object */ - - GLuint MaxVarying; /**< Number of float[4] varying parameters */ - - GLuint GLSLVersion; /**< GLSL version supported (ex: 120 = 1.20) */ - - /** Which texture units support GL_ATI_envmap_bumpmap as targets */ - GLbitfield SupportedBumpUnits; - - /** - * Maximum amount of time, measured in nanseconds, that the server can wait. - */ - GLuint64 MaxServerWaitTimeout; - - /** GL_EXT_provoking_vertex */ - GLboolean QuadsFollowProvokingVertexConvention; - - /** OpenGL version 3.0 */ - GLbitfield ContextFlags; /**< Ex: GL_CONTEXT_FLAG_FORWARD_COMPATIBLE_BIT */ - - /** OpenGL version 3.2 */ - GLbitfield ProfileMask; /**< Mask of CONTEXT_x_PROFILE_BIT */ - - /** GL_EXT_transform_feedback */ - GLuint MaxTransformFeedbackSeparateAttribs; - GLuint MaxTransformFeedbackSeparateComponents; - GLuint MaxTransformFeedbackInterleavedComponents; - - /** GL_EXT_gpu_shader4 */ - GLint MinProgramTexelOffset, MaxProgramTexelOffset; - - /* GL_EXT_framebuffer_sRGB */ - GLboolean sRGBCapable; /* can enable sRGB blend/update on FBOs */ -}; - - -/** - * Enable flag for each OpenGL extension. Different device drivers will - * enable different extensions at runtime. - */ -struct gl_extensions -{ - GLboolean dummy; /* don't remove this! */ - GLboolean dummy_true; /* Set true by _mesa_init_extensions(). */ - GLboolean dummy_false; /* Set false by _mesa_init_extensions(). 
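/* gl_constants::MaxTextureLevels above bounds the mipmap chain rather than a
 * pixel size directly; the largest base-level dimension that still allows a
 * complete chain is 1 << (MaxTextureLevels - 1).  Stand-alone sketch of that
 * relation (the helper name is ours, not a Mesa function).
 */
#include <stdio.h>

static int
max_texture_size(int max_levels)
{
   return 1 << (max_levels - 1);      /* level-0 size for a full mip chain */
}

int main(void)
{
   /* e.g. 12 levels -> 2048x2048 base level (2048, 1024, ..., 1) */
   printf("%d\n", max_texture_size(12));
   return 0;
}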
*/ - GLboolean ARB_ES2_compatibility; - GLboolean ARB_blend_func_extended; - GLboolean ARB_copy_buffer; - GLboolean ARB_depth_buffer_float; - GLboolean ARB_depth_clamp; - GLboolean ARB_depth_texture; - GLboolean ARB_draw_buffers; - GLboolean ARB_draw_buffers_blend; - GLboolean ARB_draw_elements_base_vertex; - GLboolean ARB_draw_instanced; - GLboolean ARB_fragment_coord_conventions; - GLboolean ARB_fragment_program; - GLboolean ARB_fragment_program_shadow; - GLboolean ARB_fragment_shader; - GLboolean ARB_framebuffer_object; - GLboolean ARB_explicit_attrib_location; - GLboolean ARB_geometry_shader4; - GLboolean ARB_half_float_pixel; - GLboolean ARB_half_float_vertex; - GLboolean ARB_instanced_arrays; - GLboolean ARB_map_buffer_range; - GLboolean ARB_multisample; - GLboolean ARB_multitexture; - GLboolean ARB_occlusion_query; - GLboolean ARB_occlusion_query2; - GLboolean ARB_point_sprite; - GLboolean ARB_sampler_objects; - GLboolean ARB_seamless_cube_map; - GLboolean ARB_shader_objects; - GLboolean ARB_shader_stencil_export; - GLboolean ARB_shading_language_100; - GLboolean ARB_shadow; - GLboolean ARB_shadow_ambient; - GLboolean ARB_sync; - GLboolean ARB_texture_border_clamp; - GLboolean ARB_texture_buffer_object; - GLboolean ARB_texture_compression; - GLboolean ARB_texture_compression_rgtc; - GLboolean ARB_texture_cube_map; - GLboolean ARB_texture_env_combine; - GLboolean ARB_texture_env_crossbar; - GLboolean ARB_texture_env_dot3; - GLboolean ARB_texture_float; - GLboolean ARB_texture_mirrored_repeat; - GLboolean ARB_texture_multisample; - GLboolean ARB_texture_non_power_of_two; - GLboolean ARB_texture_rg; - GLboolean ARB_texture_rgb10_a2ui; - GLboolean ARB_timer_query; - GLboolean ARB_transform_feedback2; - GLboolean ARB_transpose_matrix; - GLboolean ARB_uniform_buffer_object; - GLboolean ARB_vertex_array_object; - GLboolean ARB_vertex_buffer_object; - GLboolean ARB_vertex_program; - GLboolean ARB_vertex_shader; - GLboolean ARB_vertex_type_2_10_10_10_rev; - GLboolean ARB_window_pos; - GLboolean EXT_abgr; - GLboolean EXT_bgra; - GLboolean EXT_blend_color; - GLboolean EXT_blend_equation_separate; - GLboolean EXT_blend_func_separate; - GLboolean EXT_blend_logic_op; - GLboolean EXT_blend_minmax; - GLboolean EXT_blend_subtract; - GLboolean EXT_clip_volume_hint; - GLboolean EXT_compiled_vertex_array; - GLboolean EXT_copy_texture; - GLboolean EXT_depth_bounds_test; - GLboolean EXT_draw_buffers2; - GLboolean EXT_draw_range_elements; - GLboolean EXT_fog_coord; - GLboolean EXT_framebuffer_blit; - GLboolean EXT_framebuffer_multisample; - GLboolean EXT_framebuffer_object; - GLboolean EXT_framebuffer_sRGB; - GLboolean EXT_gpu_program_parameters; - GLboolean EXT_gpu_shader4; - GLboolean EXT_multi_draw_arrays; - GLboolean EXT_paletted_texture; - GLboolean EXT_packed_depth_stencil; - GLboolean EXT_packed_float; - GLboolean EXT_packed_pixels; - GLboolean EXT_pixel_buffer_object; - GLboolean EXT_point_parameters; - GLboolean EXT_polygon_offset; - GLboolean EXT_provoking_vertex; - GLboolean EXT_rescale_normal; - GLboolean EXT_shadow_funcs; - GLboolean EXT_secondary_color; - GLboolean EXT_separate_shader_objects; - GLboolean EXT_separate_specular_color; - GLboolean EXT_shared_texture_palette; - GLboolean EXT_stencil_wrap; - GLboolean EXT_stencil_two_side; - GLboolean EXT_subtexture; - GLboolean EXT_texture; - GLboolean EXT_texture_object; - GLboolean EXT_texture3D; - GLboolean EXT_texture_array; - GLboolean EXT_texture_compression_s3tc; - GLboolean EXT_texture_env_add; - GLboolean EXT_texture_env_combine; - 
GLboolean EXT_texture_env_dot3; - GLboolean EXT_texture_filter_anisotropic; - GLboolean EXT_texture_integer; - GLboolean EXT_texture_lod_bias; - GLboolean EXT_texture_mirror_clamp; - GLboolean EXT_texture_shared_exponent; - GLboolean EXT_texture_sRGB; - GLboolean EXT_texture_sRGB_decode; - GLboolean EXT_texture_swizzle; - GLboolean EXT_transform_feedback; - GLboolean EXT_timer_query; - GLboolean EXT_vertex_array; - GLboolean EXT_vertex_array_bgra; - GLboolean EXT_vertex_array_set; - GLboolean OES_standard_derivatives; - /* vendor extensions */ - GLboolean AMD_conservative_depth; - GLboolean APPLE_client_storage; - GLboolean APPLE_packed_pixels; - GLboolean APPLE_vertex_array_object; - GLboolean APPLE_object_purgeable; - GLboolean ATI_envmap_bumpmap; - GLboolean ATI_texture_mirror_once; - GLboolean ATI_texture_env_combine3; - GLboolean ATI_fragment_shader; - GLboolean ATI_separate_stencil; - GLboolean IBM_rasterpos_clip; - GLboolean IBM_multimode_draw_arrays; - GLboolean MESA_pack_invert; - GLboolean MESA_resize_buffers; - GLboolean MESA_ycbcr_texture; - GLboolean MESA_texture_array; - GLboolean MESA_texture_signed_rgba; - GLboolean NV_blend_square; - GLboolean NV_conditional_render; - GLboolean NV_fragment_program; - GLboolean NV_fragment_program_option; - GLboolean NV_light_max_exponent; - GLboolean NV_point_sprite; - GLboolean NV_primitive_restart; - GLboolean NV_texgen_reflection; - GLboolean NV_texture_env_combine4; - GLboolean NV_texture_rectangle; - GLboolean NV_vertex_program; - GLboolean NV_vertex_program1_1; - GLboolean OES_read_format; - GLboolean SGI_texture_color_table; - GLboolean SGIS_generate_mipmap; - GLboolean SGIS_texture_edge_clamp; - GLboolean SGIS_texture_lod; - GLboolean TDFX_texture_compression_FXT1; - GLboolean S3_s3tc; - GLboolean OES_EGL_image; - GLboolean OES_draw_texture; - GLboolean EXT_texture_format_BGRA8888; - GLboolean extension_sentinel; - /** The extension string */ - const GLubyte *String; - /** Number of supported extensions */ - GLuint Count; -}; - - -/** - * A stack of matrices (projection, modelview, color, texture, etc). - */ -struct gl_matrix_stack -{ - GLmatrix *Top; /**< points into Stack */ - GLmatrix *Stack; /**< array [MaxDepth] of GLmatrix */ - GLuint Depth; /**< 0 <= Depth < MaxDepth */ - GLuint MaxDepth; /**< size of Stack[] array */ - GLuint DirtyFlag; /**< _NEW_MODELVIEW or _NEW_PROJECTION, for example */ -}; - - -/** - * \name Bits for image transfer operations - * \sa __struct gl_contextRec::ImageTransferState. - */ -/*@{*/ -#define IMAGE_SCALE_BIAS_BIT 0x1 -#define IMAGE_SHIFT_OFFSET_BIT 0x2 -#define IMAGE_MAP_COLOR_BIT 0x4 -#define IMAGE_CLAMP_BIT 0x800 - - -/** Pixel Transfer ops */ -#define IMAGE_BITS (IMAGE_SCALE_BIAS_BIT | \ - IMAGE_SHIFT_OFFSET_BIT | \ - IMAGE_MAP_COLOR_BIT) - -/** - * \name Bits to indicate what state has changed. 
- */ -/*@{*/ -#define _NEW_MODELVIEW (1 << 0) /**< gl_context::ModelView */ -#define _NEW_PROJECTION (1 << 1) /**< gl_context::Projection */ -#define _NEW_TEXTURE_MATRIX (1 << 2) /**< gl_context::TextureMatrix */ -#define _NEW_COLOR (1 << 3) /**< gl_context::Color */ -#define _NEW_DEPTH (1 << 4) /**< gl_context::Depth */ -#define _NEW_EVAL (1 << 5) /**< gl_context::Eval, EvalMap */ -#define _NEW_FOG (1 << 6) /**< gl_context::Fog */ -#define _NEW_HINT (1 << 7) /**< gl_context::Hint */ -#define _NEW_LIGHT (1 << 8) /**< gl_context::Light */ -#define _NEW_LINE (1 << 9) /**< gl_context::Line */ -#define _NEW_PIXEL (1 << 10) /**< gl_context::Pixel */ -#define _NEW_POINT (1 << 11) /**< gl_context::Point */ -#define _NEW_POLYGON (1 << 12) /**< gl_context::Polygon */ -#define _NEW_POLYGONSTIPPLE (1 << 13) /**< gl_context::PolygonStipple */ -#define _NEW_SCISSOR (1 << 14) /**< gl_context::Scissor */ -#define _NEW_STENCIL (1 << 15) /**< gl_context::Stencil */ -#define _NEW_TEXTURE (1 << 16) /**< gl_context::Texture */ -#define _NEW_TRANSFORM (1 << 17) /**< gl_context::Transform */ -#define _NEW_VIEWPORT (1 << 18) /**< gl_context::Viewport */ -#define _NEW_PACKUNPACK (1 << 19) /**< gl_context::Pack, Unpack */ -#define _NEW_ARRAY (1 << 20) /**< gl_context::Array */ -#define _NEW_RENDERMODE (1 << 21) /**< gl_context::RenderMode, etc */ -#define _NEW_BUFFERS (1 << 22) /**< gl_context::Visual, DrawBuffer, */ -#define _NEW_CURRENT_ATTRIB (1 << 23) /**< gl_context::Current */ -#define _NEW_MULTISAMPLE (1 << 24) /**< gl_context::Multisample */ -#define _NEW_TRACK_MATRIX (1 << 25) /**< gl_context::VertexProgram */ -#define _NEW_PROGRAM (1 << 26) /**< New program/shader state */ -#define _NEW_PROGRAM_CONSTANTS (1 << 27) -#define _NEW_BUFFER_OBJECT (1 << 28) -#define _NEW_ALL ~0 -/*@}*/ - - -/** - * \name Bits to track array state changes - * - * Also used to summarize array enabled. - */ -/*@{*/ -#define _NEW_ARRAY_VERTEX VERT_BIT_POS -#define _NEW_ARRAY_WEIGHT VERT_BIT_WEIGHT -#define _NEW_ARRAY_NORMAL VERT_BIT_NORMAL -#define _NEW_ARRAY_COLOR0 VERT_BIT_COLOR0 -#define _NEW_ARRAY_COLOR1 VERT_BIT_COLOR1 -#define _NEW_ARRAY_FOGCOORD VERT_BIT_FOG -#define _NEW_ARRAY_INDEX VERT_BIT_COLOR_INDEX -#define _NEW_ARRAY_EDGEFLAG VERT_BIT_EDGEFLAG -#define _NEW_ARRAY_POINT_SIZE VERT_BIT_COLOR_INDEX /* aliased */ -#define _NEW_ARRAY_TEXCOORD_0 VERT_BIT_TEX0 -#define _NEW_ARRAY_TEXCOORD_1 VERT_BIT_TEX1 -#define _NEW_ARRAY_TEXCOORD_2 VERT_BIT_TEX2 -#define _NEW_ARRAY_TEXCOORD_3 VERT_BIT_TEX3 -#define _NEW_ARRAY_TEXCOORD_4 VERT_BIT_TEX4 -#define _NEW_ARRAY_TEXCOORD_5 VERT_BIT_TEX5 -#define _NEW_ARRAY_TEXCOORD_6 VERT_BIT_TEX6 -#define _NEW_ARRAY_TEXCOORD_7 VERT_BIT_TEX7 -#define _NEW_ARRAY_ATTRIB_0 VERT_BIT_GENERIC0 /* start at bit 16 */ -#define _NEW_ARRAY_ALL 0xffffffff - - -#define _NEW_ARRAY_TEXCOORD(i) (_NEW_ARRAY_TEXCOORD_0 << (i)) -#define _NEW_ARRAY_ATTRIB(i) (_NEW_ARRAY_ATTRIB_0 << (i)) -/*@}*/ - - - -/** - * \name A bunch of flags that we think might be useful to drivers. - * - * Set in the __struct gl_contextRec::_TriangleCaps bitfield. 
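/* The _NEW_x bits above accumulate in gl_context::NewState (documented as a
 * bitwise-or of these flags): mutators OR the relevant bit in, and validation
 * code tests and clears it.  Stand-alone sketch of that pattern with two flag
 * values copied from above; the helper functions are ours, not Mesa entry
 * points.
 */
#include <stdio.h>

#define _NEW_LIGHT    (1 << 8)
#define _NEW_VIEWPORT (1 << 18)

static unsigned new_state;            /* stands in for ctx->NewState */

static void
set_light_enabled(int on)
{
   (void) on;                         /* ... store the new value ... */
   new_state |= _NEW_LIGHT;           /* remember that lighting changed */
}

static void
validate_state(void)
{
   if (new_state & _NEW_LIGHT)
      printf("revalidating lighting\n");
   if (new_state & _NEW_VIEWPORT)
      printf("revalidating viewport\n");
   new_state = 0;                     /* everything is up to date again */
}

int main(void)
{
   set_light_enabled(1);
   validate_state();                  /* prints "revalidating lighting" */
   return 0;
}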
- */ -/*@{*/ -#define DD_FLATSHADE 0x1 -#define DD_SEPARATE_SPECULAR 0x2 -#define DD_TRI_CULL_FRONT_BACK 0x4 /* special case on some hw */ -#define DD_TRI_LIGHT_TWOSIDE 0x8 -#define DD_TRI_UNFILLED 0x10 -#define DD_TRI_SMOOTH 0x20 -#define DD_TRI_STIPPLE 0x40 -#define DD_TRI_OFFSET 0x80 -#define DD_LINE_SMOOTH 0x100 -#define DD_LINE_STIPPLE 0x200 -#define DD_POINT_SMOOTH 0x400 -#define DD_POINT_ATTEN 0x800 -#define DD_TRI_TWOSTENCIL 0x1000 -/*@}*/ - - -/** - * \name Define the state changes under which each of these bits might change - */ -/*@{*/ -#define _DD_NEW_FLATSHADE _NEW_LIGHT -#define _DD_NEW_SEPARATE_SPECULAR (_NEW_LIGHT | _NEW_FOG | _NEW_PROGRAM) -#define _DD_NEW_TRI_CULL_FRONT_BACK _NEW_POLYGON -#define _DD_NEW_TRI_LIGHT_TWOSIDE _NEW_LIGHT -#define _DD_NEW_TRI_UNFILLED _NEW_POLYGON -#define _DD_NEW_TRI_SMOOTH _NEW_POLYGON -#define _DD_NEW_TRI_STIPPLE _NEW_POLYGON -#define _DD_NEW_TRI_OFFSET _NEW_POLYGON -#define _DD_NEW_LINE_SMOOTH _NEW_LINE -#define _DD_NEW_LINE_STIPPLE _NEW_LINE -#define _DD_NEW_LINE_WIDTH _NEW_LINE -#define _DD_NEW_POINT_SMOOTH _NEW_POINT -#define _DD_NEW_POINT_SIZE _NEW_POINT -#define _DD_NEW_POINT_ATTEN _NEW_POINT -/*@}*/ - - -/** - * Composite state flags - */ -/*@{*/ -#define _MESA_NEW_NEED_EYE_COORDS (_NEW_LIGHT | \ - _NEW_TEXTURE | \ - _NEW_POINT | \ - _NEW_PROGRAM | \ - _NEW_MODELVIEW) - -#define _MESA_NEW_NEED_NORMALS (_NEW_LIGHT | \ - _NEW_TEXTURE) - -#define _MESA_NEW_TRANSFER_STATE (_NEW_PIXEL) -/*@}*/ - - - - -/* This has to be included here. */ -#include "dd.h" - - -/** - * Display list flags. - * Strictly this is a tnl-private concept, but it doesn't seem - * worthwhile adding a tnl private structure just to hold this one bit - * of information: - */ -#define DLIST_DANGLING_REFS 0x1 - - -/** Opaque declaration of display list payload data type */ -union gl_dlist_node; - - -/** - * Provide a location where information about a display list can be - * collected. Could be extended with driverPrivate structures, - * etc. in the future. - */ -struct gl_display_list -{ - GLuint Name; - GLbitfield Flags; /**< DLIST_x flags */ - /** The dlist commands are in a linked list of nodes */ - union gl_dlist_node *Head; -}; - - -/** - * State used during display list compilation and execution. - */ -struct gl_dlist_state -{ - GLuint CallDepth; /**< Current recursion calling depth */ - - struct gl_display_list *CurrentList; /**< List currently being compiled */ - union gl_dlist_node *CurrentBlock; /**< Pointer to current block of nodes */ - GLuint CurrentPos; /**< Index into current block of nodes */ - - GLvertexformat ListVtxfmt; - - GLubyte ActiveAttribSize[VERT_ATTRIB_MAX]; - GLfloat CurrentAttrib[VERT_ATTRIB_MAX][4]; - - GLubyte ActiveMaterialSize[MAT_ATTRIB_MAX]; - GLfloat CurrentMaterial[MAT_ATTRIB_MAX][4]; - - GLubyte ActiveIndex; - GLfloat CurrentIndex; - - GLubyte ActiveEdgeFlag; - GLboolean CurrentEdgeFlag; - - struct { - /* State known to have been set by the currently-compiling display - * list. Used to eliminate some redundant state changes. - */ - GLenum ShadeModel; - } Current; -}; - - -/** - * Enum for the OpenGL APIs we know about and may support. - */ -typedef enum -{ - API_OPENGL, - API_OPENGLES, - API_OPENGLES2 -} gl_api; - - -/** - * Mesa rendering context. - * - * This is the central context data structure for Mesa. Almost all - * OpenGL state is contained in this structure. - * Think of this as a base class from which device drivers will derive - * sub classes. - * - * The struct gl_context typedef names this structure. 
- */ -struct gl_context -{ - /** State possibly shared with other contexts in the address space */ - struct gl_shared_state *Shared; - - /** \name API function pointer tables */ - /*@{*/ - gl_api API; - struct _glapi_table *Save; /**< Display list save functions */ - struct _glapi_table *Exec; /**< Execute functions */ - struct _glapi_table *CurrentDispatch; /**< == Save or Exec !! */ - /*@}*/ - - struct gl_config Visual; - struct gl_framebuffer *DrawBuffer; /**< buffer for writing */ - struct gl_framebuffer *ReadBuffer; /**< buffer for reading */ - struct gl_framebuffer *WinSysDrawBuffer; /**< set with MakeCurrent */ - struct gl_framebuffer *WinSysReadBuffer; /**< set with MakeCurrent */ - - /** - * Device driver function pointer table - */ - struct dd_function_table Driver; - - void *DriverCtx; /**< Points to device driver context/state */ - - /** Core/Driver constants */ - struct gl_constants Const; - - /** \name The various 4x4 matrix stacks */ - /*@{*/ - struct gl_matrix_stack ModelviewMatrixStack; - struct gl_matrix_stack ProjectionMatrixStack; - struct gl_matrix_stack TextureMatrixStack[MAX_TEXTURE_UNITS]; - struct gl_matrix_stack ProgramMatrixStack[MAX_PROGRAM_MATRICES]; - struct gl_matrix_stack *CurrentStack; /**< Points to one of the above stacks */ - /*@}*/ - - /** Combined modelview and projection matrix */ - GLmatrix _ModelProjectMatrix; - - /** \name Display lists */ - struct gl_dlist_state ListState; - - GLboolean ExecuteFlag; /**< Execute GL commands? */ - GLboolean CompileFlag; /**< Compile GL commands into display list? */ - - /** Extension information */ - struct gl_extensions Extensions; - - /** Version info */ - GLuint VersionMajor, VersionMinor; - char *VersionString; - - /** \name State attribute stack (for glPush/PopAttrib) */ - /*@{*/ - GLuint AttribStackDepth; - struct gl_attrib_node *AttribStack[MAX_ATTRIB_STACK_DEPTH]; - /*@}*/ - - /** \name Renderer attribute groups - * - * We define a struct for each attribute group to make pushing and popping - * attributes easy. Also it's a good organization. 
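/* CurrentDispatch above is documented as pointing at either Save or Exec,
 * depending on whether GL calls are currently being compiled into a display
 * list.  Stand-alone sketch of that choice; the struct and helper are
 * illustrative only, not the real glNewList/glEndList machinery.
 */
#include <stdio.h>

struct dispatch { const char *name; };

static struct dispatch exec_table = { "Exec" };
static struct dispatch save_table = { "Save" };

/* Return the table GL entry points should be routed through. */
static struct dispatch *
pick_dispatch(int compile_flag)
{
   return compile_flag ? &save_table : &exec_table;
}

int main(void)
{
   printf("%s\n", pick_dispatch(0)->name);   /* outside glNewList: "Exec" */
   printf("%s\n", pick_dispatch(1)->name);   /* inside  glNewList: "Save" */
   return 0;
}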
- */ - /*@{*/ - struct gl_accum_attrib Accum; /**< Accum buffer attributes */ - struct gl_colorbuffer_attrib Color; /**< Color buffer attributes */ - struct gl_current_attrib Current; /**< Current attributes */ - struct gl_depthbuffer_attrib Depth; /**< Depth buffer attributes */ - struct gl_eval_attrib Eval; /**< Eval attributes */ - struct gl_fog_attrib Fog; /**< Fog attributes */ - struct gl_hint_attrib Hint; /**< Hint attributes */ - struct gl_light_attrib Light; /**< Light attributes */ - struct gl_line_attrib Line; /**< Line attributes */ - struct gl_list_attrib List; /**< List attributes */ - struct gl_multisample_attrib Multisample; - struct gl_pixel_attrib Pixel; /**< Pixel attributes */ - struct gl_point_attrib Point; /**< Point attributes */ - struct gl_polygon_attrib Polygon; /**< Polygon attributes */ - GLuint PolygonStipple[32]; /**< Polygon stipple */ - struct gl_scissor_attrib Scissor; /**< Scissor attributes */ - struct gl_stencil_attrib Stencil; /**< Stencil buffer attributes */ - struct gl_texture_attrib Texture; /**< Texture attributes */ - struct gl_transform_attrib Transform; /**< Transformation attributes */ - struct gl_viewport_attrib Viewport; /**< Viewport attributes */ - /*@}*/ - - /** \name Client attribute stack */ - /*@{*/ - GLuint ClientAttribStackDepth; - struct gl_attrib_node *ClientAttribStack[MAX_CLIENT_ATTRIB_STACK_DEPTH]; - /*@}*/ - - /** \name Client attribute groups */ - /*@{*/ - struct gl_array_attrib Array; /**< Vertex arrays */ - struct gl_pixelstore_attrib Pack; /**< Pixel packing */ - struct gl_pixelstore_attrib Unpack; /**< Pixel unpacking */ - struct gl_pixelstore_attrib DefaultPacking; /**< Default params */ - /*@}*/ - - /** \name Other assorted state (not pushed/popped on attribute stack) */ - /*@{*/ - struct gl_pixelmaps PixelMaps; - - struct gl_evaluators EvalMap; /**< All evaluators */ - struct gl_feedback Feedback; /**< Feedback */ - struct gl_selection Select; /**< Selection */ - - struct gl_program_state Program; /**< general program state */ - struct gl_vertex_program_state VertexProgram; - struct gl_fragment_program_state FragmentProgram; - struct gl_geometry_program_state GeometryProgram; - struct gl_ati_fragment_shader_state ATIFragmentShader; - - struct gl_shader_state Shader; /**< GLSL shader object state */ - struct gl_shader_compiler_options ShaderCompilerOptions[MESA_SHADER_TYPES]; - - struct gl_query_state Query; /**< occlusion, timer queries */ - - struct gl_transform_feedback TransformFeedback; - - struct gl_buffer_object *CopyReadBuffer; /**< GL_ARB_copy_buffer */ - struct gl_buffer_object *CopyWriteBuffer; /**< GL_ARB_copy_buffer */ - /*@}*/ - - struct gl_meta_state *Meta; /**< for "meta" operations */ - - /* GL_EXT_framebuffer_object */ - struct gl_renderbuffer *CurrentRenderbuffer; - - GLenum ErrorValue; /**< Last error code */ - - /** - * Recognize and silence repeated error debug messages in buggy apps. - */ - const char *ErrorDebugFmtString; - GLuint ErrorDebugCount; - - GLenum RenderMode; /**< either GL_RENDER, GL_SELECT, GL_FEEDBACK */ - GLbitfield NewState; /**< bitwise-or of _NEW_* flags */ - - GLboolean ViewportInitialized; /**< has viewport size been initialized? */ - - GLbitfield varying_vp_inputs; /**< mask of VERT_BIT_* flags */ - - /** \name Derived state */ - /*@{*/ - /** Bitwise-or of DD_* flags. Note that this bitfield may be used before - * state validation so they need to always be current. 
- */ - GLbitfield _TriangleCaps; - GLbitfield _ImageTransferState;/**< bitwise-or of IMAGE_*_BIT flags */ - GLfloat _EyeZDir[3]; - GLfloat _ModelViewInvScale; - GLboolean _NeedEyeCoords; - GLboolean _ForceEyeCoords; - - GLuint TextureStateTimestamp; /**< detect changes to shared state */ - - struct gl_shine_tab *_ShineTable[2]; /**< Active shine tables */ - struct gl_shine_tab *_ShineTabList; /**< MRU list of inactive shine tables */ - /**@}*/ - - struct gl_list_extensions *ListExt; /**< driver dlist extensions */ - - /** \name For debugging/development only */ - /*@{*/ - GLboolean FirstTimeCurrent; - /*@}*/ - - /** Dither disable via MESA_NO_DITHER env var */ - GLboolean NoDither; - - /** software compression/decompression supported or not */ - GLboolean Mesa_DXTn; - - GLboolean TextureFormatSupported[MESA_FORMAT_COUNT]; - - /** - * Use dp4 (rather than mul/mad) instructions for position - * transformation? - */ - GLboolean mvp_with_dp4; - - /** - * \name Hooks for module contexts. - * - * These will eventually live in the driver or elsewhere. - */ - /*@{*/ - void *swrast_context; - void *swsetup_context; - void *swtnl_context; - void *swtnl_im; - struct st_context *st; - void *aelt_context; - /*@}*/ -}; - - -#ifdef DEBUG -extern int MESA_VERBOSE; -extern int MESA_DEBUG_FLAGS; -# define MESA_FUNCTION __FUNCTION__ -#else -# define MESA_VERBOSE 0 -# define MESA_DEBUG_FLAGS 0 -# define MESA_FUNCTION "a function" -# ifndef NDEBUG -# define NDEBUG -# endif -#endif - - -/** The MESA_VERBOSE var is a bitmask of these flags */ -enum _verbose -{ - VERBOSE_VARRAY = 0x0001, - VERBOSE_TEXTURE = 0x0002, - VERBOSE_MATERIAL = 0x0004, - VERBOSE_PIPELINE = 0x0008, - VERBOSE_DRIVER = 0x0010, - VERBOSE_STATE = 0x0020, - VERBOSE_API = 0x0040, - VERBOSE_DISPLAY_LIST = 0x0100, - VERBOSE_LIGHTING = 0x0200, - VERBOSE_PRIMS = 0x0400, - VERBOSE_VERTS = 0x0800, - VERBOSE_DISASSEM = 0x1000, - VERBOSE_DRAW = 0x2000, - VERBOSE_SWAPBUFFERS = 0x4000 -}; - - -/** The MESA_DEBUG_FLAGS var is a bitmask of these flags */ -enum _debug -{ - DEBUG_ALWAYS_FLUSH = 0x1 -}; - - - -#endif /* MTYPES_H */ +/* + * Mesa 3-D graphics library + * Version: 7.7 + * + * Copyright (C) 1999-2008 Brian Paul All Rights Reserved. + * Copyright (C) 2009 VMware, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN + * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +/** + * \file mtypes.h + * Main Mesa data structures. + * + * Please try to mark derived values with a leading underscore ('_'). 
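/* MESA_VERBOSE above is a bitmask of the VERBOSE_x values, and collapses to
 * the constant 0 in non-DEBUG builds so the checks compile away.  Stand-alone
 * sketch of the logging pattern; the enum values are copied from above, the
 * variable and message are made up.
 */
#include <stdio.h>

enum { VERBOSE_API = 0x0040, VERBOSE_DRAW = 0x2000 };

static int mesa_verbose = VERBOSE_API;   /* stands in for MESA_VERBOSE */

static void
api_trace(const char *call)
{
   if (mesa_verbose & VERBOSE_API)
      fprintf(stderr, "api: %s\n", call);
}

int main(void)
{
   api_trace("glClear(GL_COLOR_BUFFER_BIT)");   /* printed          */
   mesa_verbose = 0;
   api_trace("glFlush()");                      /* silently skipped */
   return 0;
}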
+ */ + +#ifndef MTYPES_H +#define MTYPES_H + + +#include "main/glheader.h" +#include "main/config.h" +#include "main/mfeatures.h" +#include "glapi/glapi.h" +#include "math/m_matrix.h" /* GLmatrix */ +#include "main/simple_list.h" /* struct simple_node */ +#include "main/formats.h" /* MESA_FORMAT_COUNT */ + + +/** + * Color channel data type. + */ +#if CHAN_BITS == 8 + typedef GLubyte GLchan; +#define CHAN_MAX 255 +#define CHAN_MAXF 255.0F +#define CHAN_TYPE GL_UNSIGNED_BYTE +#elif CHAN_BITS == 16 + typedef GLushort GLchan; +#define CHAN_MAX 65535 +#define CHAN_MAXF 65535.0F +#define CHAN_TYPE GL_UNSIGNED_SHORT +#elif CHAN_BITS == 32 + typedef GLfloat GLchan; +#define CHAN_MAX 1.0 +#define CHAN_MAXF 1.0F +#define CHAN_TYPE GL_FLOAT +#else +#error "illegal number of color channel bits" +#endif + + +/** + * Stencil buffer data type. + */ +#if STENCIL_BITS==8 + typedef GLubyte GLstencil; +#elif STENCIL_BITS==16 + typedef GLushort GLstencil; +#else +# error "illegal number of stencil bits" +#endif + + +/** + * \name 64-bit extension of GLbitfield. + */ +/*@{*/ +typedef GLuint64 GLbitfield64; + +/** Set a single bit */ +#define BITFIELD64_BIT(b) (1ULL << (b)) + + +/** + * \name Some forward type declarations + */ +/*@{*/ +struct _mesa_HashTable; +struct gl_attrib_node; +struct gl_list_extensions; +struct gl_meta_state; +struct gl_pixelstore_attrib; +struct gl_program_cache; +struct gl_texture_format; +struct gl_texture_image; +struct gl_texture_object; +struct gl_context; +struct st_context; +/*@}*/ + + +/** Extra draw modes beyond GL_POINTS, GL_TRIANGLE_FAN, etc */ +#define PRIM_OUTSIDE_BEGIN_END (GL_POLYGON+1) +#define PRIM_INSIDE_UNKNOWN_PRIM (GL_POLYGON+2) +#define PRIM_UNKNOWN (GL_POLYGON+3) + + +/** + * Shader stages. Note that these will become 5 with tessellation. + * These MUST have the same values as gallium's PIPE_SHADER_* + */ +typedef enum +{ + MESA_SHADER_VERTEX = 0, + MESA_SHADER_FRAGMENT = 1, + MESA_SHADER_GEOMETRY = 2, + MESA_SHADER_TYPES = 3 +} gl_shader_type; + + + +/** + * Indexes for vertex program attributes. + * GL_NV_vertex_program aliases generic attributes over the conventional + * attributes. In GL_ARB_vertex_program shader the aliasing is optional. + * In GL_ARB_vertex_shader / OpenGL 2.0 the aliasing is disallowed (the + * generic attributes are distinct/separate). + */ +typedef enum +{ + VERT_ATTRIB_POS = 0, + VERT_ATTRIB_WEIGHT = 1, + VERT_ATTRIB_NORMAL = 2, + VERT_ATTRIB_COLOR0 = 3, + VERT_ATTRIB_COLOR1 = 4, + VERT_ATTRIB_FOG = 5, + VERT_ATTRIB_COLOR_INDEX = 6, + VERT_ATTRIB_POINT_SIZE = 6, /*alias*/ + VERT_ATTRIB_EDGEFLAG = 7, + VERT_ATTRIB_TEX0 = 8, + VERT_ATTRIB_TEX1 = 9, + VERT_ATTRIB_TEX2 = 10, + VERT_ATTRIB_TEX3 = 11, + VERT_ATTRIB_TEX4 = 12, + VERT_ATTRIB_TEX5 = 13, + VERT_ATTRIB_TEX6 = 14, + VERT_ATTRIB_TEX7 = 15, + VERT_ATTRIB_GENERIC0 = 16, + VERT_ATTRIB_GENERIC1 = 17, + VERT_ATTRIB_GENERIC2 = 18, + VERT_ATTRIB_GENERIC3 = 19, + VERT_ATTRIB_GENERIC4 = 20, + VERT_ATTRIB_GENERIC5 = 21, + VERT_ATTRIB_GENERIC6 = 22, + VERT_ATTRIB_GENERIC7 = 23, + VERT_ATTRIB_GENERIC8 = 24, + VERT_ATTRIB_GENERIC9 = 25, + VERT_ATTRIB_GENERIC10 = 26, + VERT_ATTRIB_GENERIC11 = 27, + VERT_ATTRIB_GENERIC12 = 28, + VERT_ATTRIB_GENERIC13 = 29, + VERT_ATTRIB_GENERIC14 = 30, + VERT_ATTRIB_GENERIC15 = 31, + VERT_ATTRIB_MAX = 32 +} gl_vert_attrib; + +/** + * Bitflags for vertex attributes. + * These are used in bitfields in many places. 
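/* BITFIELD64_BIT above exists because masks such as
 * gl_program::OutputsWritten can need more than 32 bits; a plain (1 << b)
 * would overflow for b >= 32.  Stand-alone sketch of setting and testing a
 * high bit; uint64_t stands in for GLuint64/GLbitfield64 so the snippet
 * builds without the GL headers.
 */
#include <stdint.h>
#include <stdio.h>

typedef uint64_t GLbitfield64;
#define BITFIELD64_BIT(b) (1ULL << (b))

int main(void)
{
   GLbitfield64 outputs_written = 0;

   outputs_written |= BITFIELD64_BIT(40);        /* safe beyond bit 31 */

   printf("%d\n", (outputs_written & BITFIELD64_BIT(40)) != 0);  /* 1 */
   printf("%d\n", (outputs_written & BITFIELD64_BIT(3)) != 0);   /* 0 */
   return 0;
}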
+ */ +/*@{*/ +#define VERT_BIT_POS (1 << VERT_ATTRIB_POS) +#define VERT_BIT_WEIGHT (1 << VERT_ATTRIB_WEIGHT) +#define VERT_BIT_NORMAL (1 << VERT_ATTRIB_NORMAL) +#define VERT_BIT_COLOR0 (1 << VERT_ATTRIB_COLOR0) +#define VERT_BIT_COLOR1 (1 << VERT_ATTRIB_COLOR1) +#define VERT_BIT_FOG (1 << VERT_ATTRIB_FOG) +#define VERT_BIT_COLOR_INDEX (1 << VERT_ATTRIB_COLOR_INDEX) +#define VERT_BIT_EDGEFLAG (1 << VERT_ATTRIB_EDGEFLAG) +#define VERT_BIT_TEX0 (1 << VERT_ATTRIB_TEX0) +#define VERT_BIT_TEX1 (1 << VERT_ATTRIB_TEX1) +#define VERT_BIT_TEX2 (1 << VERT_ATTRIB_TEX2) +#define VERT_BIT_TEX3 (1 << VERT_ATTRIB_TEX3) +#define VERT_BIT_TEX4 (1 << VERT_ATTRIB_TEX4) +#define VERT_BIT_TEX5 (1 << VERT_ATTRIB_TEX5) +#define VERT_BIT_TEX6 (1 << VERT_ATTRIB_TEX6) +#define VERT_BIT_TEX7 (1 << VERT_ATTRIB_TEX7) +#define VERT_BIT_GENERIC0 (1 << VERT_ATTRIB_GENERIC0) +#define VERT_BIT_GENERIC1 (1 << VERT_ATTRIB_GENERIC1) +#define VERT_BIT_GENERIC2 (1 << VERT_ATTRIB_GENERIC2) +#define VERT_BIT_GENERIC3 (1 << VERT_ATTRIB_GENERIC3) +#define VERT_BIT_GENERIC4 (1 << VERT_ATTRIB_GENERIC4) +#define VERT_BIT_GENERIC5 (1 << VERT_ATTRIB_GENERIC5) +#define VERT_BIT_GENERIC6 (1 << VERT_ATTRIB_GENERIC6) +#define VERT_BIT_GENERIC7 (1 << VERT_ATTRIB_GENERIC7) +#define VERT_BIT_GENERIC8 (1 << VERT_ATTRIB_GENERIC8) +#define VERT_BIT_GENERIC9 (1 << VERT_ATTRIB_GENERIC9) +#define VERT_BIT_GENERIC10 (1 << VERT_ATTRIB_GENERIC10) +#define VERT_BIT_GENERIC11 (1 << VERT_ATTRIB_GENERIC11) +#define VERT_BIT_GENERIC12 (1 << VERT_ATTRIB_GENERIC12) +#define VERT_BIT_GENERIC13 (1 << VERT_ATTRIB_GENERIC13) +#define VERT_BIT_GENERIC14 (1 << VERT_ATTRIB_GENERIC14) +#define VERT_BIT_GENERIC15 (1 << VERT_ATTRIB_GENERIC15) + +#define VERT_BIT_TEX(u) (1 << (VERT_ATTRIB_TEX0 + (u))) +#define VERT_BIT_GENERIC(g) (1 << (VERT_ATTRIB_GENERIC0 + (g))) +/*@}*/ + + +/** + * Indexes for vertex program result attributes + */ +typedef enum +{ + VERT_RESULT_HPOS = 0, + VERT_RESULT_COL0 = 1, + VERT_RESULT_COL1 = 2, + VERT_RESULT_FOGC = 3, + VERT_RESULT_TEX0 = 4, + VERT_RESULT_TEX1 = 5, + VERT_RESULT_TEX2 = 6, + VERT_RESULT_TEX3 = 7, + VERT_RESULT_TEX4 = 8, + VERT_RESULT_TEX5 = 9, + VERT_RESULT_TEX6 = 10, + VERT_RESULT_TEX7 = 11, + VERT_RESULT_PSIZ = 12, + VERT_RESULT_BFC0 = 13, + VERT_RESULT_BFC1 = 14, + VERT_RESULT_EDGE = 15, + VERT_RESULT_VAR0 = 16, /**< shader varying */ + VERT_RESULT_MAX = (VERT_RESULT_VAR0 + MAX_VARYING) +} gl_vert_result; + + +/*********************************************/ + +/** + * Indexes for geometry program attributes. + */ +typedef enum +{ + GEOM_ATTRIB_POSITION = 0, + GEOM_ATTRIB_COLOR0 = 1, + GEOM_ATTRIB_COLOR1 = 2, + GEOM_ATTRIB_SECONDARY_COLOR0 = 3, + GEOM_ATTRIB_SECONDARY_COLOR1 = 4, + GEOM_ATTRIB_FOG_FRAG_COORD = 5, + GEOM_ATTRIB_POINT_SIZE = 6, + GEOM_ATTRIB_CLIP_VERTEX = 7, + GEOM_ATTRIB_PRIMITIVE_ID = 8, + GEOM_ATTRIB_TEX_COORD = 9, + + GEOM_ATTRIB_VAR0 = 16, + GEOM_ATTRIB_MAX = (GEOM_ATTRIB_VAR0 + MAX_VARYING) +} gl_geom_attrib; + +/** + * Bitflags for geometry attributes. + * These are used in bitfields in many places. 
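/* The VERT_BIT_x values above are single-bit flags over gl_vert_attrib, with
 * VERT_BIT_TEX(u) and VERT_BIT_GENERIC(g) as indexed shorthands.  Stand-alone
 * sketch of building and testing an "arrays enabled" mask; the attribute
 * indices are copied from the enum above, everything else is local to the
 * example.
 */
#include <stdio.h>

#define VERT_ATTRIB_POS       0
#define VERT_ATTRIB_TEX0      8
#define VERT_ATTRIB_GENERIC0 16

#define VERT_BIT_POS        (1 << VERT_ATTRIB_POS)
#define VERT_BIT_TEX(u)     (1 << (VERT_ATTRIB_TEX0 + (u)))
#define VERT_BIT_GENERIC(g) (1 << (VERT_ATTRIB_GENERIC0 + (g)))

int main(void)
{
   unsigned enabled = VERT_BIT_POS | VERT_BIT_TEX(1) | VERT_BIT_GENERIC(3);

   printf("%d\n", (enabled & VERT_BIT_TEX(1)) != 0);   /* 1 */
   printf("%d\n", (enabled & VERT_BIT_TEX(2)) != 0);   /* 0 */
   return 0;
}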
+ */ +/*@{*/ +#define GEOM_BIT_COLOR0 (1 << GEOM_ATTRIB_COLOR0) +#define GEOM_BIT_COLOR1 (1 << GEOM_ATTRIB_COLOR1) +#define GEOM_BIT_SCOLOR0 (1 << GEOM_ATTRIB_SECONDARY_COLOR0) +#define GEOM_BIT_SCOLOR1 (1 << GEOM_ATTRIB_SECONDARY_COLOR1) +#define GEOM_BIT_TEX_COORD (1 << GEOM_ATTRIB_TEX_COORD) +#define GEOM_BIT_FOG_COORD (1 << GEOM_ATTRIB_FOG_FRAG_COORD) +#define GEOM_BIT_POSITION (1 << GEOM_ATTRIB_POSITION) +#define GEOM_BIT_POINT_SIDE (1 << GEOM_ATTRIB_POINT_SIZE) +#define GEOM_BIT_CLIP_VERTEX (1 << GEOM_ATTRIB_CLIP_VERTEX) +#define GEOM_BIT_PRIM_ID (1 << GEOM_ATTRIB_PRIMITIVE_ID) +#define GEOM_BIT_VAR0 (1 << GEOM_ATTRIB_VAR0) + +#define GEOM_BIT_VAR(g) (1 << (GEOM_BIT_VAR0 + (g))) +/*@}*/ + + +/** + * Indexes for geometry program result attributes + */ +typedef enum +{ + GEOM_RESULT_POS = 0, + GEOM_RESULT_COL0 = 1, + GEOM_RESULT_COL1 = 2, + GEOM_RESULT_SCOL0 = 3, + GEOM_RESULT_SCOL1 = 4, + GEOM_RESULT_FOGC = 5, + GEOM_RESULT_TEX0 = 6, + GEOM_RESULT_TEX1 = 7, + GEOM_RESULT_TEX2 = 8, + GEOM_RESULT_TEX3 = 9, + GEOM_RESULT_TEX4 = 10, + GEOM_RESULT_TEX5 = 11, + GEOM_RESULT_TEX6 = 12, + GEOM_RESULT_TEX7 = 13, + GEOM_RESULT_PSIZ = 14, + GEOM_RESULT_CLPV = 15, + GEOM_RESULT_PRID = 16, + GEOM_RESULT_LAYR = 17, + GEOM_RESULT_VAR0 = 18, /**< shader varying, should really be 16 */ + /* ### we need to -2 because var0 is 18 instead 16 like in the others */ + GEOM_RESULT_MAX = (GEOM_RESULT_VAR0 + MAX_VARYING - 2) +} gl_geom_result; + + +/** + * Indexes for fragment program input attributes. + */ +typedef enum +{ + FRAG_ATTRIB_WPOS = 0, + FRAG_ATTRIB_COL0 = 1, + FRAG_ATTRIB_COL1 = 2, + FRAG_ATTRIB_FOGC = 3, + FRAG_ATTRIB_TEX0 = 4, + FRAG_ATTRIB_TEX1 = 5, + FRAG_ATTRIB_TEX2 = 6, + FRAG_ATTRIB_TEX3 = 7, + FRAG_ATTRIB_TEX4 = 8, + FRAG_ATTRIB_TEX5 = 9, + FRAG_ATTRIB_TEX6 = 10, + FRAG_ATTRIB_TEX7 = 11, + FRAG_ATTRIB_FACE = 12, /**< front/back face */ + FRAG_ATTRIB_PNTC = 13, /**< sprite/point coord */ + FRAG_ATTRIB_VAR0 = 14, /**< shader varying */ + FRAG_ATTRIB_MAX = (FRAG_ATTRIB_VAR0 + MAX_VARYING) +} gl_frag_attrib; + +/** + * Bitflags for fragment program input attributes. 
+ */ +/*@{*/ +#define FRAG_BIT_WPOS (1 << FRAG_ATTRIB_WPOS) +#define FRAG_BIT_COL0 (1 << FRAG_ATTRIB_COL0) +#define FRAG_BIT_COL1 (1 << FRAG_ATTRIB_COL1) +#define FRAG_BIT_FOGC (1 << FRAG_ATTRIB_FOGC) +#define FRAG_BIT_FACE (1 << FRAG_ATTRIB_FACE) +#define FRAG_BIT_PNTC (1 << FRAG_ATTRIB_PNTC) +#define FRAG_BIT_TEX0 (1 << FRAG_ATTRIB_TEX0) +#define FRAG_BIT_TEX1 (1 << FRAG_ATTRIB_TEX1) +#define FRAG_BIT_TEX2 (1 << FRAG_ATTRIB_TEX2) +#define FRAG_BIT_TEX3 (1 << FRAG_ATTRIB_TEX3) +#define FRAG_BIT_TEX4 (1 << FRAG_ATTRIB_TEX4) +#define FRAG_BIT_TEX5 (1 << FRAG_ATTRIB_TEX5) +#define FRAG_BIT_TEX6 (1 << FRAG_ATTRIB_TEX6) +#define FRAG_BIT_TEX7 (1 << FRAG_ATTRIB_TEX7) +#define FRAG_BIT_VAR0 (1 << FRAG_ATTRIB_VAR0) + +#define FRAG_BIT_TEX(U) (FRAG_BIT_TEX0 << (U)) +#define FRAG_BIT_VAR(V) (FRAG_BIT_VAR0 << (V)) + +#define FRAG_BITS_TEX_ANY (FRAG_BIT_TEX0| \ + FRAG_BIT_TEX1| \ + FRAG_BIT_TEX2| \ + FRAG_BIT_TEX3| \ + FRAG_BIT_TEX4| \ + FRAG_BIT_TEX5| \ + FRAG_BIT_TEX6| \ + FRAG_BIT_TEX7) +/*@}*/ + + +/** + * Fragment program results + */ +typedef enum +{ + FRAG_RESULT_DEPTH = 0, + FRAG_RESULT_STENCIL = 1, + FRAG_RESULT_COLOR = 2, + FRAG_RESULT_DATA0 = 3, + FRAG_RESULT_MAX = (FRAG_RESULT_DATA0 + MAX_DRAW_BUFFERS) +} gl_frag_result; + + +/** + * Indexes for all renderbuffers + */ +typedef enum +{ + /* the four standard color buffers */ + BUFFER_FRONT_LEFT, + BUFFER_BACK_LEFT, + BUFFER_FRONT_RIGHT, + BUFFER_BACK_RIGHT, + BUFFER_DEPTH, + BUFFER_STENCIL, + BUFFER_ACCUM, + /* optional aux buffer */ + BUFFER_AUX0, + /* generic renderbuffers */ + BUFFER_COLOR0, + BUFFER_COLOR1, + BUFFER_COLOR2, + BUFFER_COLOR3, + BUFFER_COLOR4, + BUFFER_COLOR5, + BUFFER_COLOR6, + BUFFER_COLOR7, + BUFFER_COUNT +} gl_buffer_index; + +/** + * Bit flags for all renderbuffers + */ +#define BUFFER_BIT_FRONT_LEFT (1 << BUFFER_FRONT_LEFT) +#define BUFFER_BIT_BACK_LEFT (1 << BUFFER_BACK_LEFT) +#define BUFFER_BIT_FRONT_RIGHT (1 << BUFFER_FRONT_RIGHT) +#define BUFFER_BIT_BACK_RIGHT (1 << BUFFER_BACK_RIGHT) +#define BUFFER_BIT_AUX0 (1 << BUFFER_AUX0) +#define BUFFER_BIT_AUX1 (1 << BUFFER_AUX1) +#define BUFFER_BIT_AUX2 (1 << BUFFER_AUX2) +#define BUFFER_BIT_AUX3 (1 << BUFFER_AUX3) +#define BUFFER_BIT_DEPTH (1 << BUFFER_DEPTH) +#define BUFFER_BIT_STENCIL (1 << BUFFER_STENCIL) +#define BUFFER_BIT_ACCUM (1 << BUFFER_ACCUM) +#define BUFFER_BIT_COLOR0 (1 << BUFFER_COLOR0) +#define BUFFER_BIT_COLOR1 (1 << BUFFER_COLOR1) +#define BUFFER_BIT_COLOR2 (1 << BUFFER_COLOR2) +#define BUFFER_BIT_COLOR3 (1 << BUFFER_COLOR3) +#define BUFFER_BIT_COLOR4 (1 << BUFFER_COLOR4) +#define BUFFER_BIT_COLOR5 (1 << BUFFER_COLOR5) +#define BUFFER_BIT_COLOR6 (1 << BUFFER_COLOR6) +#define BUFFER_BIT_COLOR7 (1 << BUFFER_COLOR7) + +/** + * Mask of all the color buffer bits (but not accum). + */ +#define BUFFER_BITS_COLOR (BUFFER_BIT_FRONT_LEFT | \ + BUFFER_BIT_BACK_LEFT | \ + BUFFER_BIT_FRONT_RIGHT | \ + BUFFER_BIT_BACK_RIGHT | \ + BUFFER_BIT_AUX0 | \ + BUFFER_BIT_COLOR0 | \ + BUFFER_BIT_COLOR1 | \ + BUFFER_BIT_COLOR2 | \ + BUFFER_BIT_COLOR3 | \ + BUFFER_BIT_COLOR4 | \ + BUFFER_BIT_COLOR5 | \ + BUFFER_BIT_COLOR6 | \ + BUFFER_BIT_COLOR7) + + +/** + * Framebuffer configuration (aka visual / pixelformat) + * Note: some of these fields should be boolean, but it appears that + * code in drivers/dri/common/util.c requires int-sized fields. + */ +struct gl_config +{ + GLboolean rgbMode; + GLboolean floatMode; + GLboolean colorIndexMode; /* XXX is this used anywhere? 
 */
+   GLuint doubleBufferMode;
+   GLuint stereoMode;
+
+   GLboolean haveAccumBuffer;
+   GLboolean haveDepthBuffer;
+   GLboolean haveStencilBuffer;
+
+   GLint redBits, greenBits, blueBits, alphaBits;   /* bits per comp */
+   GLuint redMask, greenMask, blueMask, alphaMask;
+   GLint rgbBits;              /* total bits for rgb */
+   GLint indexBits;            /* total bits for colorindex */
+
+   GLint accumRedBits, accumGreenBits, accumBlueBits, accumAlphaBits;
+   GLint depthBits;
+   GLint stencilBits;
+
+   GLint numAuxBuffers;
+
+   GLint level;
+
+   /* EXT_visual_rating / GLX 1.2 */
+   GLint visualRating;
+
+   /* EXT_visual_info / GLX 1.2 */
+   GLint transparentPixel;
+   /* colors are floats scaled to ints */
+   GLint transparentRed, transparentGreen, transparentBlue, transparentAlpha;
+   GLint transparentIndex;
+
+   /* ARB_multisample / SGIS_multisample */
+   GLint sampleBuffers;
+   GLint samples;
+
+   /* SGIX_pbuffer / GLX 1.3 */
+   GLint maxPbufferWidth;
+   GLint maxPbufferHeight;
+   GLint maxPbufferPixels;
+   GLint optimalPbufferWidth;   /* Only for SGIX_pbuffer. */
+   GLint optimalPbufferHeight;  /* Only for SGIX_pbuffer. */
+
+   /* OML_swap_method */
+   GLint swapMethod;
+
+   /* EXT_texture_from_pixmap */
+   GLint bindToTextureRgb;
+   GLint bindToTextureRgba;
+   GLint bindToMipmapTexture;
+   GLint bindToTextureTargets;
+   GLint yInverted;
+
+   /* EXT_framebuffer_sRGB */
+   GLint sRGBCapable;
+};
+
+
+/**
+ * Data structure for color tables
+ */
+struct gl_color_table
+{
+   GLenum InternalFormat;      /**< The user-specified format */
+   GLenum _BaseFormat;         /**< GL_ALPHA, GL_RGBA, GL_RGB, etc */
+   GLuint Size;                /**< number of entries in table */
+   GLfloat *TableF;            /**< Color table, floating point values */
+   GLubyte *TableUB;           /**< Color table, ubyte values */
+   GLubyte RedSize;
+   GLubyte GreenSize;
+   GLubyte BlueSize;
+   GLubyte AlphaSize;
+   GLubyte LuminanceSize;
+   GLubyte IntensitySize;
+};
+
+
+/**
+ * \name Bit flags used for updating material values.
+ */
+/*@{*/
+#define MAT_ATTRIB_FRONT_AMBIENT           0
+#define MAT_ATTRIB_BACK_AMBIENT            1
+#define MAT_ATTRIB_FRONT_DIFFUSE           2
+#define MAT_ATTRIB_BACK_DIFFUSE            3
+#define MAT_ATTRIB_FRONT_SPECULAR          4
+#define MAT_ATTRIB_BACK_SPECULAR           5
+#define MAT_ATTRIB_FRONT_EMISSION          6
+#define MAT_ATTRIB_BACK_EMISSION           7
+#define MAT_ATTRIB_FRONT_SHININESS         8
+#define MAT_ATTRIB_BACK_SHININESS          9
+#define MAT_ATTRIB_FRONT_INDEXES          10
+#define MAT_ATTRIB_BACK_INDEXES           11
+#define MAT_ATTRIB_MAX                    12
+
+#define MAT_ATTRIB_AMBIENT(f)   (MAT_ATTRIB_FRONT_AMBIENT+(f))
+#define MAT_ATTRIB_DIFFUSE(f)   (MAT_ATTRIB_FRONT_DIFFUSE+(f))
+#define MAT_ATTRIB_SPECULAR(f)  (MAT_ATTRIB_FRONT_SPECULAR+(f))
+#define MAT_ATTRIB_EMISSION(f)  (MAT_ATTRIB_FRONT_EMISSION+(f))
+#define MAT_ATTRIB_SHININESS(f) (MAT_ATTRIB_FRONT_SHININESS+(f))
+#define MAT_ATTRIB_INDEXES(f)   (MAT_ATTRIB_FRONT_INDEXES+(f))
+
+#define MAT_INDEX_AMBIENT  0
+#define MAT_INDEX_DIFFUSE  1
+#define MAT_INDEX_SPECULAR 2
+
+#define MAT_BIT_FRONT_AMBIENT         (1<<MAT_ATTRIB_FRONT_AMBIENT)
+#define MAT_BIT_BACK_AMBIENT          (1<<MAT_ATTRIB_BACK_AMBIENT)
+#define MAT_BIT_FRONT_DIFFUSE         (1<<MAT_ATTRIB_FRONT_DIFFUSE)
+#define MAT_BIT_BACK_DIFFUSE          (1<<MAT_ATTRIB_BACK_DIFFUSE)
+#define MAT_BIT_FRONT_SPECULAR        (1<<MAT_ATTRIB_FRONT_SPECULAR)
+#define MAT_BIT_BACK_SPECULAR         (1<<MAT_ATTRIB_BACK_SPECULAR)
+#define MAT_BIT_FRONT_EMISSION        (1<<MAT_ATTRIB_FRONT_EMISSION)
+#define MAT_BIT_BACK_EMISSION         (1<<MAT_ATTRIB_BACK_EMISSION)
+#define MAT_BIT_FRONT_SHININESS       (1<<MAT_ATTRIB_FRONT_SHININESS)
+#define MAT_BIT_BACK_SHININESS        (1<<MAT_ATTRIB_BACK_SHININESS)
+#define MAT_BIT_FRONT_INDEXES         (1<<MAT_ATTRIB_FRONT_INDEXES)
+#define MAT_BIT_BACK_INDEXES          (1<<MAT_ATTRIB_BACK_INDEXES)
+
+#define FRONT_MATERIAL_BITS   (MAT_BIT_FRONT_EMISSION |  \
+                               MAT_BIT_FRONT_AMBIENT |   \
+                               MAT_BIT_FRONT_DIFFUSE |   \
+                               MAT_BIT_FRONT_SPECULAR |  \
+                               MAT_BIT_FRONT_SHININESS | \
+                               MAT_BIT_FRONT_INDEXES)
+
+#define BACK_MATERIAL_BITS    (MAT_BIT_BACK_EMISSION |   \
+                               MAT_BIT_BACK_AMBIENT |    \
+                               MAT_BIT_BACK_DIFFUSE |    \
+                               MAT_BIT_BACK_SPECULAR |   \
+                               MAT_BIT_BACK_SHININESS |  \
+                               MAT_BIT_BACK_INDEXES)
+
+#define ALL_MATERIAL_BITS     (FRONT_MATERIAL_BITS | BACK_MATERIAL_BITS)
+/*@}*/
+
+
+#define EXP_TABLE_SIZE 512    /**< Specular exponent lookup table size */
+
+
+/**
+ * Light source state.
+ */
+struct gl_light
+{
+   struct gl_light *next;       /**< double linked list with sentinel */
+   struct gl_light *prev;
+
+   GLfloat Ambient[4];          /**< ambient color */
+   GLfloat Diffuse[4];          /**< diffuse color */
+   GLfloat Specular[4];         /**< specular color */
+   GLfloat EyePosition[4];      /**< position in eye coordinates */
+   GLfloat SpotDirection[4];    /**< spotlight direction in eye coordinates */
+   GLfloat SpotExponent;
+   GLfloat SpotCutoff;          /**< in degrees */
+   GLfloat _CosCutoffNeg;       /**< = cos(SpotCutoff) */
+   GLfloat _CosCutoff;          /**< = MAX(0, cos(SpotCutoff)) */
+   GLfloat ConstantAttenuation;
+   GLfloat LinearAttenuation;
+   GLfloat QuadraticAttenuation;
+   GLboolean Enabled;           /**< On/off flag */
+
+   /**
+    * \name Derived fields
+    */
+   /*@{*/
+   GLbitfield _Flags;           /**< Mask of LIGHT_* flags */
+
+   GLfloat _Position[4];        /**< position in eye/obj coordinates */
+   GLfloat _VP_inf_norm[3];     /**< Norm direction to infinite light */
+   GLfloat _h_inf_norm[3];      /**< Norm( _VP_inf_norm + <0,0,1> ) */
+   GLfloat _NormSpotDirection[4]; /**< normalized spotlight direction */
+   GLfloat _VP_inf_spot_attenuation;
+
+   GLfloat _SpotExpTable[EXP_TABLE_SIZE][2];  /**< to replace a pow() call */
+   GLfloat _MatAmbient[2][3];   /**< material ambient * light ambient */
+   GLfloat _MatDiffuse[2][3];   /**< material diffuse * light diffuse */
+   GLfloat _MatSpecular[2][3];  /**< material spec * light specular */
+   GLfloat _dli;                /**< CI diffuse light intensity */
+   GLfloat _sli;                /**< CI specular light intensity */
+   /*@}*/
+};
+
+
+/**
+ * Light model state.
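The MAT_ATTRIB_*(f) helpers above take a face index (0 = front, 1 = back), which lets two-sided material updates share one code path. A sketch, assuming the gl_material struct declared further below; the function itself is illustrative, not Mesa code.

/* Store an RGBA ambient value for the given face (0 = front, 1 = back). */
static void
example_set_ambient(struct gl_material *mat, GLuint face, const GLfloat rgba[4])
{
   GLuint i;
   for (i = 0; i < 4; i++)
      mat->Attrib[MAT_ATTRIB_AMBIENT(face)][i] = rgba[i];
   /* callers would also raise the dirty bit (1 << MAT_ATTRIB_AMBIENT(face)) */
}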
+ */ +struct gl_lightmodel +{ + GLfloat Ambient[4]; /**< ambient color */ + GLboolean LocalViewer; /**< Local (or infinite) view point? */ + GLboolean TwoSide; /**< Two (or one) sided lighting? */ + GLenum ColorControl; /**< either GL_SINGLE_COLOR + * or GL_SEPARATE_SPECULAR_COLOR */ +}; + + +/** + * Material state. + */ +struct gl_material +{ + GLfloat Attrib[MAT_ATTRIB_MAX][4]; +}; + + +/** + * Accumulation buffer attribute group (GL_ACCUM_BUFFER_BIT) + */ +struct gl_accum_attrib +{ + GLfloat ClearColor[4]; /**< Accumulation buffer clear color */ +}; + + +/** + * Color buffer attribute group (GL_COLOR_BUFFER_BIT). + */ +struct gl_colorbuffer_attrib +{ + GLuint ClearIndex; /**< Index to use for glClear */ + GLclampf ClearColor[4]; /**< Color to use for glClear */ + + GLuint IndexMask; /**< Color index write mask */ + GLubyte ColorMask[MAX_DRAW_BUFFERS][4];/**< Each flag is 0xff or 0x0 */ + + GLenum DrawBuffer[MAX_DRAW_BUFFERS]; /**< Which buffer to draw into */ + + /** + * \name alpha testing + */ + /*@{*/ + GLboolean AlphaEnabled; /**< Alpha test enabled flag */ + GLenum AlphaFunc; /**< Alpha test function */ + GLclampf AlphaRef; /**< Alpha reference value */ + /*@}*/ + + /** + * \name Blending + */ + /*@{*/ + GLbitfield BlendEnabled; /**< Per-buffer blend enable flags */ + GLfloat BlendColor[4]; /**< Blending color */ + struct + { + GLenum SrcRGB; /**< RGB blend source term */ + GLenum DstRGB; /**< RGB blend dest term */ + GLenum SrcA; /**< Alpha blend source term */ + GLenum DstA; /**< Alpha blend dest term */ + GLenum EquationRGB; /**< GL_ADD, GL_SUBTRACT, etc. */ + GLenum EquationA; /**< GL_ADD, GL_SUBTRACT, etc. */ + } Blend[MAX_DRAW_BUFFERS]; + /** Are the blend func terms currently different for each buffer/target? */ + GLboolean _BlendFuncPerBuffer; + /** Are the blend equations currently different for each buffer/target? */ + GLboolean _BlendEquationPerBuffer; + /*@}*/ + + /** + * \name Logic op + */ + /*@{*/ + GLenum LogicOp; /**< Logic operator */ + GLboolean IndexLogicOpEnabled; /**< Color index logic op enabled flag */ + GLboolean ColorLogicOpEnabled; /**< RGBA logic op enabled flag */ + GLboolean _LogicOpEnabled; /**< RGBA logic op + EXT_blend_logic_op enabled flag */ + /*@}*/ + + GLboolean DitherFlag; /**< Dither enable flag */ + + GLenum ClampFragmentColor; /**< GL_TRUE, GL_FALSE or GL_FIXED_ONLY_ARB */ + GLenum ClampReadColor; /**< GL_TRUE, GL_FALSE or GL_FIXED_ONLY_ARB */ + + GLboolean sRGBEnabled; /**< Framebuffer sRGB blending/updating requested */ +}; + + +/** + * Current attribute group (GL_CURRENT_BIT). + */ +struct gl_current_attrib +{ + /** + * \name Current vertex attributes. + * \note Values are valid only after FLUSH_VERTICES has been called. + * \note Index and Edgeflag current values are stored as floats in the + * SIX and SEVEN attribute slots. + */ + GLfloat Attrib[VERT_ATTRIB_MAX][4]; /**< Position, color, texcoords, etc */ + + /** + * \name Current raster position attributes (always valid). + * \note This set of attributes is very similar to the SWvertex struct. + */ + /*@{*/ + GLfloat RasterPos[4]; + GLfloat RasterDistance; + GLfloat RasterColor[4]; + GLfloat RasterSecondaryColor[4]; + GLfloat RasterTexCoords[MAX_TEXTURE_COORD_UNITS][4]; + GLboolean RasterPosValid; + /*@}*/ +}; + + +/** + * Depth buffer attribute group (GL_DEPTH_BUFFER_BIT). 
+ */ +struct gl_depthbuffer_attrib +{ + GLenum Func; /**< Function for depth buffer compare */ + GLclampd Clear; /**< Value to clear depth buffer to */ + GLboolean Test; /**< Depth buffering enabled flag */ + GLboolean Mask; /**< Depth buffer writable? */ + GLboolean BoundsTest; /**< GL_EXT_depth_bounds_test */ + GLfloat BoundsMin, BoundsMax;/**< GL_EXT_depth_bounds_test */ +}; + + +/** + * Evaluator attribute group (GL_EVAL_BIT). + */ +struct gl_eval_attrib +{ + /** + * \name Enable bits + */ + /*@{*/ + GLboolean Map1Color4; + GLboolean Map1Index; + GLboolean Map1Normal; + GLboolean Map1TextureCoord1; + GLboolean Map1TextureCoord2; + GLboolean Map1TextureCoord3; + GLboolean Map1TextureCoord4; + GLboolean Map1Vertex3; + GLboolean Map1Vertex4; + GLboolean Map1Attrib[16]; /* GL_NV_vertex_program */ + GLboolean Map2Color4; + GLboolean Map2Index; + GLboolean Map2Normal; + GLboolean Map2TextureCoord1; + GLboolean Map2TextureCoord2; + GLboolean Map2TextureCoord3; + GLboolean Map2TextureCoord4; + GLboolean Map2Vertex3; + GLboolean Map2Vertex4; + GLboolean Map2Attrib[16]; /* GL_NV_vertex_program */ + GLboolean AutoNormal; + /*@}*/ + + /** + * \name Map Grid endpoints and divisions and calculated du values + */ + /*@{*/ + GLint MapGrid1un; + GLfloat MapGrid1u1, MapGrid1u2, MapGrid1du; + GLint MapGrid2un, MapGrid2vn; + GLfloat MapGrid2u1, MapGrid2u2, MapGrid2du; + GLfloat MapGrid2v1, MapGrid2v2, MapGrid2dv; + /*@}*/ +}; + + +/** + * Fog attribute group (GL_FOG_BIT). + */ +struct gl_fog_attrib +{ + GLboolean Enabled; /**< Fog enabled flag */ + GLfloat Color[4]; /**< Fog color */ + GLfloat Density; /**< Density >= 0.0 */ + GLfloat Start; /**< Start distance in eye coords */ + GLfloat End; /**< End distance in eye coords */ + GLfloat Index; /**< Fog index */ + GLenum Mode; /**< Fog mode */ + GLboolean ColorSumEnabled; + GLenum FogCoordinateSource; /**< GL_EXT_fog_coord */ + GLfloat _Scale; /**< (End == Start) ? 1.0 : 1.0 / (End - Start) */ +}; + + +/** + * \brief Layout qualifiers for gl_FragDepth. + * + * Extension AMD_conservative_depth allows gl_FragDepth to be redeclared with + * a layout qualifier. + * + * \see enum ir_depth_layout + */ +enum gl_frag_depth_layout { + FRAG_DEPTH_LAYOUT_NONE, /**< No layout is specified. */ + FRAG_DEPTH_LAYOUT_ANY, + FRAG_DEPTH_LAYOUT_GREATER, + FRAG_DEPTH_LAYOUT_LESS, + FRAG_DEPTH_LAYOUT_UNCHANGED +}; + + +/** + * Hint attribute group (GL_HINT_BIT). + * + * Values are always one of GL_FASTEST, GL_NICEST, or GL_DONT_CARE. + */ +struct gl_hint_attrib +{ + GLenum PerspectiveCorrection; + GLenum PointSmooth; + GLenum LineSmooth; + GLenum PolygonSmooth; + GLenum Fog; + GLenum ClipVolumeClipping; /**< GL_EXT_clip_volume_hint */ + GLenum TextureCompression; /**< GL_ARB_texture_compression */ + GLenum GenerateMipmap; /**< GL_SGIS_generate_mipmap */ + GLenum FragmentShaderDerivative; /**< GL_ARB_fragment_shader */ +}; + +/** + * Light state flags. + */ +/*@{*/ +#define LIGHT_SPOT 0x1 +#define LIGHT_LOCAL_VIEWER 0x2 +#define LIGHT_POSITIONAL 0x4 +#define LIGHT_NEED_VERTICES (LIGHT_POSITIONAL|LIGHT_LOCAL_VIEWER) +/*@}*/ + + +/** + * Lighting attribute group (GL_LIGHT_BIT). 
+ */ +struct gl_light_attrib +{ + struct gl_light Light[MAX_LIGHTS]; /**< Array of light sources */ + struct gl_lightmodel Model; /**< Lighting model */ + + /** + * Must flush FLUSH_VERTICES before referencing: + */ + /*@{*/ + struct gl_material Material; /**< Includes front & back values */ + /*@}*/ + + GLboolean Enabled; /**< Lighting enabled flag */ + GLenum ShadeModel; /**< GL_FLAT or GL_SMOOTH */ + GLenum ProvokingVertex; /**< GL_EXT_provoking_vertex */ + GLenum ColorMaterialFace; /**< GL_FRONT, BACK or FRONT_AND_BACK */ + GLenum ColorMaterialMode; /**< GL_AMBIENT, GL_DIFFUSE, etc */ + GLbitfield ColorMaterialBitmask; /**< bitmask formed from Face and Mode */ + GLboolean ColorMaterialEnabled; + GLenum ClampVertexColor; + + struct gl_light EnabledList; /**< List sentinel */ + + /** + * Derived state for optimizations: + */ + /*@{*/ + GLboolean _NeedEyeCoords; + GLboolean _NeedVertices; /**< Use fast shader? */ + GLbitfield _Flags; /**< LIGHT_* flags, see above */ + GLfloat _BaseColor[2][3]; + /*@}*/ +}; + + +/** + * Line attribute group (GL_LINE_BIT). + */ +struct gl_line_attrib +{ + GLboolean SmoothFlag; /**< GL_LINE_SMOOTH enabled? */ + GLboolean StippleFlag; /**< GL_LINE_STIPPLE enabled? */ + GLushort StipplePattern; /**< Stipple pattern */ + GLint StippleFactor; /**< Stipple repeat factor */ + GLfloat Width; /**< Line width */ +}; + + +/** + * Display list attribute group (GL_LIST_BIT). + */ +struct gl_list_attrib +{ + GLuint ListBase; +}; + + +/** + * Multisample attribute group (GL_MULTISAMPLE_BIT). + */ +struct gl_multisample_attrib +{ + GLboolean Enabled; + GLboolean _Enabled; /**< true if Enabled and multisample buffer */ + GLboolean SampleAlphaToCoverage; + GLboolean SampleAlphaToOne; + GLboolean SampleCoverage; + GLfloat SampleCoverageValue; + GLboolean SampleCoverageInvert; +}; + + +/** + * A pixelmap (see glPixelMap) + */ +struct gl_pixelmap +{ + GLint Size; + GLfloat Map[MAX_PIXEL_MAP_TABLE]; + GLubyte Map8[MAX_PIXEL_MAP_TABLE]; /**< converted to 8-bit color */ +}; + + +/** + * Collection of all pixelmaps + */ +struct gl_pixelmaps +{ + struct gl_pixelmap RtoR; /**< i.e. GL_PIXEL_MAP_R_TO_R */ + struct gl_pixelmap GtoG; + struct gl_pixelmap BtoB; + struct gl_pixelmap AtoA; + struct gl_pixelmap ItoR; + struct gl_pixelmap ItoG; + struct gl_pixelmap ItoB; + struct gl_pixelmap ItoA; + struct gl_pixelmap ItoI; + struct gl_pixelmap StoS; +}; + + +/** + * Pixel attribute group (GL_PIXEL_MODE_BIT). + */ +struct gl_pixel_attrib +{ + GLenum ReadBuffer; /**< source buffer for glRead/CopyPixels() */ + + /*--- Begin Pixel Transfer State ---*/ + /* Fields are in the order in which they're applied... */ + + /** Scale & Bias (index shift, offset) */ + /*@{*/ + GLfloat RedBias, RedScale; + GLfloat GreenBias, GreenScale; + GLfloat BlueBias, BlueScale; + GLfloat AlphaBias, AlphaScale; + GLfloat DepthBias, DepthScale; + GLint IndexShift, IndexOffset; + /*@}*/ + + /* Pixel Maps */ + /* Note: actual pixel maps are not part of this attrib group */ + GLboolean MapColorFlag; + GLboolean MapStencilFlag; + + /*--- End Pixel Transfer State ---*/ + + /** glPixelZoom */ + GLfloat ZoomX, ZoomY; + + /** GL_SGI_texture_color_table */ + GLfloat TextureColorTableScale[4]; /**< RGBA */ + GLfloat TextureColorTableBias[4]; /**< RGBA */ +}; + + +/** + * Point attribute group (GL_POINT_BIT). 
+ */ +struct gl_point_attrib +{ + GLboolean SmoothFlag; /**< True if GL_POINT_SMOOTH is enabled */ + GLfloat Size; /**< User-specified point size */ + GLfloat Params[3]; /**< GL_EXT_point_parameters */ + GLfloat MinSize, MaxSize; /**< GL_EXT_point_parameters */ + GLfloat Threshold; /**< GL_EXT_point_parameters */ + GLboolean _Attenuated; /**< True if Params != [1, 0, 0] */ + GLboolean PointSprite; /**< GL_NV/ARB_point_sprite */ + GLboolean CoordReplace[MAX_TEXTURE_COORD_UNITS]; /**< GL_ARB_point_sprite*/ + GLenum SpriteRMode; /**< GL_NV_point_sprite (only!) */ + GLenum SpriteOrigin; /**< GL_ARB_point_sprite */ +}; + + +/** + * Polygon attribute group (GL_POLYGON_BIT). + */ +struct gl_polygon_attrib +{ + GLenum FrontFace; /**< Either GL_CW or GL_CCW */ + GLenum FrontMode; /**< Either GL_POINT, GL_LINE or GL_FILL */ + GLenum BackMode; /**< Either GL_POINT, GL_LINE or GL_FILL */ + GLboolean _FrontBit; /**< 0=GL_CCW, 1=GL_CW */ + GLboolean CullFlag; /**< Culling on/off flag */ + GLboolean SmoothFlag; /**< True if GL_POLYGON_SMOOTH is enabled */ + GLboolean StippleFlag; /**< True if GL_POLYGON_STIPPLE is enabled */ + GLenum CullFaceMode; /**< Culling mode GL_FRONT or GL_BACK */ + GLfloat OffsetFactor; /**< Polygon offset factor, from user */ + GLfloat OffsetUnits; /**< Polygon offset units, from user */ + GLboolean OffsetPoint; /**< Offset in GL_POINT mode */ + GLboolean OffsetLine; /**< Offset in GL_LINE mode */ + GLboolean OffsetFill; /**< Offset in GL_FILL mode */ +}; + + +/** + * Scissor attributes (GL_SCISSOR_BIT). + */ +struct gl_scissor_attrib +{ + GLboolean Enabled; /**< Scissor test enabled? */ + GLint X, Y; /**< Lower left corner of box */ + GLsizei Width, Height; /**< Size of box */ +}; + + +/** + * Stencil attribute group (GL_STENCIL_BUFFER_BIT). + * + * Three sets of stencil data are tracked so that OpenGL 2.0, + * GL_EXT_stencil_two_side, and GL_ATI_separate_stencil can all be supported + * simultaneously. In each of the stencil state arrays, element 0 corresponds + * to GL_FRONT. Element 1 corresponds to the OpenGL 2.0 / + * GL_ATI_separate_stencil GL_BACK state. Element 2 corresponds to the + * GL_EXT_stencil_two_side GL_BACK state. + * + * The derived value \c _BackFace is either 1 or 2 depending on whether or + * not GL_STENCIL_TEST_TWO_SIDE_EXT is enabled. + * + * The derived value \c _TestTwoSide is set when the front-face and back-face + * stencil state are different. + */ +struct gl_stencil_attrib +{ + GLboolean Enabled; /**< Enabled flag */ + GLboolean TestTwoSide; /**< GL_EXT_stencil_two_side */ + GLubyte ActiveFace; /**< GL_EXT_stencil_two_side (0 or 2) */ + GLboolean _Enabled; /**< Enabled and stencil buffer present */ + GLboolean _TestTwoSide; + GLubyte _BackFace; /**< Current back stencil state (1 or 2) */ + GLenum Function[3]; /**< Stencil function */ + GLenum FailFunc[3]; /**< Fail function */ + GLenum ZPassFunc[3]; /**< Depth buffer pass function */ + GLenum ZFailFunc[3]; /**< Depth buffer fail function */ + GLint Ref[3]; /**< Reference value */ + GLuint ValueMask[3]; /**< Value mask */ + GLuint WriteMask[3]; /**< Write mask */ + GLuint Clear; /**< Clear value */ +}; + + +/** + * An index for each type of texture object. These correspond to the GL + * texture target enums, such as GL_TEXTURE_2D, GL_TEXTURE_CUBE_MAP, etc. + * Note: the order is from highest priority to lowest priority. 
+ */ +typedef enum +{ + TEXTURE_2D_ARRAY_INDEX, + TEXTURE_1D_ARRAY_INDEX, + TEXTURE_CUBE_INDEX, + TEXTURE_3D_INDEX, + TEXTURE_RECT_INDEX, + TEXTURE_2D_INDEX, + TEXTURE_1D_INDEX, + NUM_TEXTURE_TARGETS +} gl_texture_index; + + +/** + * Bit flags for each type of texture object + * Used for Texture.Unit[]._ReallyEnabled flags. + */ +/*@{*/ +#define TEXTURE_2D_ARRAY_BIT (1 << TEXTURE_2D_ARRAY_INDEX) +#define TEXTURE_1D_ARRAY_BIT (1 << TEXTURE_1D_ARRAY_INDEX) +#define TEXTURE_CUBE_BIT (1 << TEXTURE_CUBE_INDEX) +#define TEXTURE_3D_BIT (1 << TEXTURE_3D_INDEX) +#define TEXTURE_RECT_BIT (1 << TEXTURE_RECT_INDEX) +#define TEXTURE_2D_BIT (1 << TEXTURE_2D_INDEX) +#define TEXTURE_1D_BIT (1 << TEXTURE_1D_INDEX) +/*@}*/ + + +/** + * TexGenEnabled flags. + */ +/*@{*/ +#define S_BIT 1 +#define T_BIT 2 +#define R_BIT 4 +#define Q_BIT 8 +#define STR_BITS (S_BIT | T_BIT | R_BIT) +/*@}*/ + + +/** + * Bit flag versions of the corresponding GL_ constants. + */ +/*@{*/ +#define TEXGEN_SPHERE_MAP 0x1 +#define TEXGEN_OBJ_LINEAR 0x2 +#define TEXGEN_EYE_LINEAR 0x4 +#define TEXGEN_REFLECTION_MAP_NV 0x8 +#define TEXGEN_NORMAL_MAP_NV 0x10 + +#define TEXGEN_NEED_NORMALS (TEXGEN_SPHERE_MAP | \ + TEXGEN_REFLECTION_MAP_NV | \ + TEXGEN_NORMAL_MAP_NV) +#define TEXGEN_NEED_EYE_COORD (TEXGEN_SPHERE_MAP | \ + TEXGEN_REFLECTION_MAP_NV | \ + TEXGEN_NORMAL_MAP_NV | \ + TEXGEN_EYE_LINEAR) +/*@}*/ + + + +/** Tex-gen enabled for texture unit? */ +#define ENABLE_TEXGEN(unit) (1 << (unit)) + +/** Non-identity texture matrix for texture unit? */ +#define ENABLE_TEXMAT(unit) (1 << (unit)) + + +/** + * Texel fetch function prototype. We use texel fetch functions to + * extract RGBA, color indexes and depth components out of 1D, 2D and 3D + * texture images. These functions help to isolate us from the gritty + * details of all the various texture image encodings. + * + * \param texImage texture image. + * \param col texel column. + * \param row texel row. + * \param img texel image level/layer. + * \param texelOut output texel (up to 4 GLchans) + */ +typedef void (*FetchTexelFuncC)( const struct gl_texture_image *texImage, + GLint col, GLint row, GLint img, + GLchan *texelOut ); + +/** + * As above, but returns floats. + * Used for depth component images and for upcoming signed/float + * texture images. + */ +typedef void (*FetchTexelFuncF)( const struct gl_texture_image *texImage, + GLint col, GLint row, GLint img, + GLfloat *texelOut ); + + +typedef void (*StoreTexelFunc)(struct gl_texture_image *texImage, + GLint col, GLint row, GLint img, + const void *texel); + + +/** + * Texture image state. Describes the dimensions of a texture image, + * the texel format and pointers to Texel Fetch functions. + */ +struct gl_texture_image +{ + GLint InternalFormat; /**< Internal format as given by the user */ + GLenum _BaseFormat; /**< Either GL_RGB, GL_RGBA, GL_ALPHA, + * GL_LUMINANCE, GL_LUMINANCE_ALPHA, + * GL_INTENSITY, GL_COLOR_INDEX, + * GL_DEPTH_COMPONENT or GL_DEPTH_STENCIL_EXT + * only. Used for choosing TexEnv arithmetic. 
+ */ + gl_format TexFormat; /**< The actual texture memory format */ + + GLuint Border; /**< 0 or 1 */ + GLuint Width; /**< = 2^WidthLog2 + 2*Border */ + GLuint Height; /**< = 2^HeightLog2 + 2*Border */ + GLuint Depth; /**< = 2^DepthLog2 + 2*Border */ + GLuint Width2; /**< = Width - 2*Border */ + GLuint Height2; /**< = Height - 2*Border */ + GLuint Depth2; /**< = Depth - 2*Border */ + GLuint WidthLog2; /**< = log2(Width2) */ + GLuint HeightLog2; /**< = log2(Height2) */ + GLuint DepthLog2; /**< = log2(Depth2) */ + GLuint MaxLog2; /**< = MAX(WidthLog2, HeightLog2) */ + GLfloat WidthScale; /**< used for mipmap LOD computation */ + GLfloat HeightScale; /**< used for mipmap LOD computation */ + GLfloat DepthScale; /**< used for mipmap LOD computation */ + GLboolean IsClientData; /**< Data owned by client? */ + GLboolean _IsPowerOfTwo; /**< Are all dimensions powers of two? */ + + struct gl_texture_object *TexObject; /**< Pointer back to parent object */ + + FetchTexelFuncC FetchTexelc; /**< GLchan texel fetch function pointer */ + FetchTexelFuncF FetchTexelf; /**< Float texel fetch function pointer */ + + GLuint RowStride; /**< Padded width in units of texels */ + GLuint *ImageOffsets; /**< if 3D texture: array [Depth] of offsets to + each 2D slice in 'Data', in texels */ + GLvoid *Data; /**< Image data, accessed via FetchTexel() */ + + /** + * \name For device driver: + */ + /*@{*/ + void *DriverData; /**< Arbitrary device driver data */ + /*@}*/ +}; + + +/** + * Indexes for cube map faces. + */ +typedef enum +{ + FACE_POS_X = 0, + FACE_NEG_X = 1, + FACE_POS_Y = 2, + FACE_NEG_Y = 3, + FACE_POS_Z = 4, + FACE_NEG_Z = 5, + MAX_FACES = 6 +} gl_face_index; + + +/** + * Texture object state. Contains the array of mipmap images, border color, + * wrap modes, filter modes, shadow/texcompare state, and the per-texture + * color palette. + */ +struct gl_texture_object +{ + _glthread_Mutex Mutex; /**< for thread safety */ + GLint RefCount; /**< reference count */ + GLuint Name; /**< the user-visible texture object ID */ + GLenum Target; /**< GL_TEXTURE_1D, GL_TEXTURE_2D, etc. */ + GLfloat Priority; /**< in [0,1] */ + union { + GLfloat f[4]; + GLuint ui[4]; + GLint i[4]; + } BorderColor; /**< Interpreted according to texture format */ + GLenum WrapS; /**< S-axis texture image wrap mode */ + GLenum WrapT; /**< T-axis texture image wrap mode */ + GLenum WrapR; /**< R-axis texture image wrap mode */ + GLenum MinFilter; /**< minification filter */ + GLenum MagFilter; /**< magnification filter */ + GLfloat MinLod; /**< min lambda, OpenGL 1.2 */ + GLfloat MaxLod; /**< max lambda, OpenGL 1.2 */ + GLfloat LodBias; /**< OpenGL 1.4 */ + GLint BaseLevel; /**< min mipmap level, OpenGL 1.2 */ + GLint MaxLevel; /**< max mipmap level, OpenGL 1.2 */ + GLfloat MaxAnisotropy; /**< GL_EXT_texture_filter_anisotropic */ + GLenum CompareMode; /**< GL_ARB_shadow */ + GLenum CompareFunc; /**< GL_ARB_shadow */ + GLfloat CompareFailValue; /**< GL_ARB_shadow_ambient */ + GLenum DepthMode; /**< GL_ARB_depth_texture */ + GLint _MaxLevel; /**< actual max mipmap level (q in the spec) */ + GLfloat _MaxLambda; /**< = _MaxLevel - BaseLevel (q - b in spec) */ + GLint CropRect[4]; /**< GL_OES_draw_texture */ + GLenum Swizzle[4]; /**< GL_EXT_texture_swizzle */ + GLuint _Swizzle; /**< same as Swizzle, but SWIZZLE_* format */ + GLboolean GenerateMipmap; /**< GL_SGIS_generate_mipmap */ + GLboolean _Complete; /**< Is texture object complete? */ + GLboolean _RenderToTexture; /**< Any rendering to this texture? 
*/ + GLboolean Purgeable; /**< Is the buffer purgeable under memory pressure? */ + GLenum sRGBDecode; /**< GL_DECODE_EXT or GL_SKIP_DECODE_EXT */ + + /** Actual texture images, indexed by [cube face] and [mipmap level] */ + struct gl_texture_image *Image[MAX_FACES][MAX_TEXTURE_LEVELS]; + + /** GL_EXT_paletted_texture */ + struct gl_color_table Palette; + + /** + * \name For device driver. + * Note: instead of attaching driver data to this pointer, it's preferable + * to instead use this struct as a base class for your own texture object + * class. Driver->NewTextureObject() can be used to implement the + * allocation. + */ + void *DriverData; /**< Arbitrary device driver data */ +}; + + +/** Up to four combiner sources are possible with GL_NV_texture_env_combine4 */ +#define MAX_COMBINER_TERMS 4 + + +/** + * Texture combine environment state. + */ +struct gl_tex_env_combine_state +{ + GLenum ModeRGB; /**< GL_REPLACE, GL_DECAL, GL_ADD, etc. */ + GLenum ModeA; /**< GL_REPLACE, GL_DECAL, GL_ADD, etc. */ + /** Source terms: GL_PRIMARY_COLOR, GL_TEXTURE, etc */ + GLenum SourceRGB[MAX_COMBINER_TERMS]; + GLenum SourceA[MAX_COMBINER_TERMS]; + /** Source operands: GL_SRC_COLOR, GL_ONE_MINUS_SRC_COLOR, etc */ + GLenum OperandRGB[MAX_COMBINER_TERMS]; + GLenum OperandA[MAX_COMBINER_TERMS]; + GLuint ScaleShiftRGB; /**< 0, 1 or 2 */ + GLuint ScaleShiftA; /**< 0, 1 or 2 */ + GLuint _NumArgsRGB; /**< Number of inputs used for the RGB combiner */ + GLuint _NumArgsA; /**< Number of inputs used for the A combiner */ +}; + + +/** + * Texture coord generation state. + */ +struct gl_texgen +{ + GLenum Mode; /**< GL_EYE_LINEAR, GL_SPHERE_MAP, etc */ + GLbitfield _ModeBit; /**< TEXGEN_x bit corresponding to Mode */ + GLfloat ObjectPlane[4]; + GLfloat EyePlane[4]; +}; + + +/** + * Texture unit state. Contains enable flags, texture environment/function/ + * combiners, texgen state, pointers to current texture objects and + * post-filter color tables. + */ +struct gl_texture_unit +{ + GLbitfield Enabled; /**< bitmask of TEXTURE_*_BIT flags */ + GLbitfield _ReallyEnabled; /**< 0 or exactly one of TEXTURE_*_BIT flags */ + + GLenum EnvMode; /**< GL_MODULATE, GL_DECAL, GL_BLEND, etc. */ + GLfloat EnvColor[4]; + + struct gl_texgen GenS; + struct gl_texgen GenT; + struct gl_texgen GenR; + struct gl_texgen GenQ; + GLbitfield TexGenEnabled; /**< Bitwise-OR of [STRQ]_BIT values */ + GLbitfield _GenFlags; /**< Bitwise-OR of Gen[STRQ]._ModeBit */ + + GLfloat LodBias; /**< for biasing mipmap levels */ + GLenum BumpTarget; + GLfloat RotMatrix[4]; /* 2x2 matrix */ + + /** + * \name GL_EXT_texture_env_combine + */ + struct gl_tex_env_combine_state Combine; + + /** + * Derived state based on \c EnvMode and the \c BaseFormat of the + * currently enabled texture. + */ + struct gl_tex_env_combine_state _EnvMode; + + /** + * Currently enabled combiner state. This will point to either + * \c Combine or \c _EnvMode. + */ + struct gl_tex_env_combine_state *_CurrentCombine; + + /** Current texture object pointers */ + struct gl_texture_object *CurrentTex[NUM_TEXTURE_TARGETS]; + + /** Points to highest priority, complete and enabled texture object */ + struct gl_texture_object *_Current; + + /** GL_SGI_texture_color_table */ + /*@{*/ + struct gl_color_table ColorTable; + struct gl_color_table ProxyColorTable; + GLboolean ColorTableEnabled; + /*@}*/ +}; + + +/** + * Texture attribute group (GL_TEXTURE_BIT). 
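The _CurrentCombine comment above says the pointer refers either to the user-specified Combine state or to the derived _EnvMode translation of the classic env modes. A sketch of that choice; the GL_COMBINE4_NV case and texture-completeness details are omitted, and the helper is illustrative only.

static void
example_choose_combiner(struct gl_texture_unit *unit)
{
   if (unit->EnvMode == GL_COMBINE)
      unit->_CurrentCombine = &unit->Combine;    /* user combiner state */
   else
      unit->_CurrentCombine = &unit->_EnvMode;   /* derived from EnvMode */
}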
+ */ +struct gl_texture_attrib +{ + GLuint CurrentUnit; /**< GL_ACTIVE_TEXTURE */ + struct gl_texture_unit Unit[MAX_COMBINED_TEXTURE_IMAGE_UNITS]; + + struct gl_texture_object *ProxyTex[NUM_TEXTURE_TARGETS]; + + /** GL_ARB_seamless_cubemap */ + GLboolean CubeMapSeamless; + + /** GL_EXT_shared_texture_palette */ + GLboolean SharedPalette; + struct gl_color_table Palette; + + /** Texture units/samplers used by vertex or fragment texturing */ + GLbitfield _EnabledUnits; + + /** Texture coord units/sets used for fragment texturing */ + GLbitfield _EnabledCoordUnits; + + /** Texture coord units that have texgen enabled */ + GLbitfield _TexGenEnabled; + + /** Texture coord units that have non-identity matrices */ + GLbitfield _TexMatEnabled; + + /** Bitwise-OR of all Texture.Unit[i]._GenFlags */ + GLbitfield _GenFlags; +}; + + +/** + * Transformation attribute group (GL_TRANSFORM_BIT). + */ +struct gl_transform_attrib +{ + GLenum MatrixMode; /**< Matrix mode */ + GLfloat EyeUserPlane[MAX_CLIP_PLANES][4]; /**< User clip planes */ + GLfloat _ClipUserPlane[MAX_CLIP_PLANES][4]; /**< derived */ + GLbitfield ClipPlanesEnabled; /**< on/off bitmask */ + GLboolean Normalize; /**< Normalize all normals? */ + GLboolean RescaleNormals; /**< GL_EXT_rescale_normal */ + GLboolean RasterPositionUnclipped; /**< GL_IBM_rasterpos_clip */ + GLboolean DepthClamp; /**< GL_ARB_depth_clamp */ + + GLfloat CullEyePos[4]; + GLfloat CullObjPos[4]; +}; + + +/** + * Viewport attribute group (GL_VIEWPORT_BIT). + */ +struct gl_viewport_attrib +{ + GLint X, Y; /**< position */ + GLsizei Width, Height; /**< size */ + GLfloat Near, Far; /**< Depth buffer range */ + GLmatrix _WindowMap; /**< Mapping transformation as a matrix. */ +}; + + +/** + * GL_ARB_vertex/pixel_buffer_object buffer object + */ +struct gl_buffer_object +{ + _glthread_Mutex Mutex; + GLint RefCount; + GLuint Name; + GLenum Usage; /**< GL_STREAM_DRAW_ARB, GL_STREAM_READ_ARB, etc. */ + GLsizeiptrARB Size; /**< Size of buffer storage in bytes */ + GLubyte *Data; /**< Location of storage either in RAM or VRAM. */ + /** Fields describing a mapped buffer */ + /*@{*/ + GLbitfield AccessFlags; /**< Mask of GL_MAP_x_BIT flags */ + GLvoid *Pointer; /**< User-space address of mapping */ + GLintptr Offset; /**< Mapped offset */ + GLsizeiptr Length; /**< Mapped length */ + /*@}*/ + GLboolean Written; /**< Ever written to? (for debugging) */ + GLboolean Purgeable; /**< Is the buffer purgeable under memory pressure? */ +}; + + +/** + * Client pixel packing/unpacking attributes + */ +struct gl_pixelstore_attrib +{ + GLint Alignment; + GLint RowLength; + GLint SkipPixels; + GLint SkipRows; + GLint ImageHeight; + GLint SkipImages; + GLboolean SwapBytes; + GLboolean LsbFirst; + GLboolean ClientStorage; /**< GL_APPLE_client_storage */ + GLboolean Invert; /**< GL_MESA_pack_invert */ + struct gl_buffer_object *BufferObj; /**< GL_ARB_pixel_buffer_object */ +}; + + +/** + * Client vertex array attributes + */ +struct gl_client_array +{ + GLint Size; /**< components per element (1,2,3,4) */ + GLenum Type; /**< datatype: GL_FLOAT, GL_INT, etc */ + GLenum Format; /**< default: GL_RGBA, but may be GL_BGRA */ + GLsizei Stride; /**< user-specified stride */ + GLsizei StrideB; /**< actual stride in bytes */ + const GLubyte *Ptr; /**< Points to array data */ + GLboolean Enabled; /**< Enabled flag is a boolean */ + GLboolean Normalized; /**< GL_ARB_vertex_program */ + GLboolean Integer; /**< Integer-valued? 
*/ + GLuint InstanceDivisor; /**< GL_ARB_instanced_arrays */ + GLuint _ElementSize; /**< size of each element in bytes */ + + struct gl_buffer_object *BufferObj;/**< GL_ARB_vertex_buffer_object */ + GLuint _MaxElement; /**< max element index into array buffer + 1 */ +}; + + +/** + * Collection of vertex arrays. Defined by the GL_APPLE_vertex_array_object + * extension, but a nice encapsulation in any case. + */ +struct gl_array_object +{ + /** Name of the array object as received from glGenVertexArrayAPPLE. */ + GLuint Name; + + GLint RefCount; + _glthread_Mutex Mutex; + GLboolean VBOonly; /**< require all arrays to live in VBOs? */ + + /** Conventional vertex arrays */ + /*@{*/ + struct gl_client_array Vertex; + struct gl_client_array Weight; + struct gl_client_array Normal; + struct gl_client_array Color; + struct gl_client_array SecondaryColor; + struct gl_client_array FogCoord; + struct gl_client_array Index; + struct gl_client_array EdgeFlag; + struct gl_client_array TexCoord[MAX_TEXTURE_COORD_UNITS]; + struct gl_client_array PointSize; + /*@}*/ + + /** + * Generic arrays for vertex programs/shaders. + * For NV vertex programs, these attributes alias and take priority + * over the conventional attribs above. For ARB vertex programs and + * GLSL vertex shaders, these attributes are separate. + */ + struct gl_client_array VertexAttrib[MAX_VERTEX_GENERIC_ATTRIBS]; + + /** Mask of _NEW_ARRAY_* values indicating which arrays are enabled */ + GLbitfield _Enabled; + + /** + * Min of all enabled arrays' _MaxElement. When arrays reside inside VBOs + * we can determine the max legal (in bounds) glDrawElements array index. + */ + GLuint _MaxElement; +}; + + +/** + * Vertex array state + */ +struct gl_array_attrib +{ + /** Currently bound array object. See _mesa_BindVertexArrayAPPLE() */ + struct gl_array_object *ArrayObj; + + /** The default vertex array object */ + struct gl_array_object *DefaultArrayObj; + + /** Array objects (GL_ARB/APPLE_vertex_array_object) */ + struct _mesa_HashTable *Objects; + + GLint ActiveTexture; /**< Client Active Texture */ + GLuint LockFirst; /**< GL_EXT_compiled_vertex_array */ + GLuint LockCount; /**< GL_EXT_compiled_vertex_array */ + + /** GL 3.1 (slightly different from GL_NV_primitive_restart) */ + GLboolean PrimitiveRestart; + GLuint RestartIndex; + + GLbitfield NewState; /**< mask of _NEW_ARRAY_* values */ + GLboolean RebindArrays; /**< whether the VBO module should rebind arrays */ + + /* GL_ARB_vertex_buffer_object */ + struct gl_buffer_object *ArrayBufferObj; + struct gl_buffer_object *ElementArrayBufferObj; +}; + + +/** + * Feedback buffer state + */ +struct gl_feedback +{ + GLenum Type; + GLbitfield _Mask; /**< FB_* bits */ + GLfloat *Buffer; + GLuint BufferSize; + GLuint Count; +}; + + +/** + * Selection buffer state + */ +struct gl_selection +{ + GLuint *Buffer; /**< selection buffer */ + GLuint BufferSize; /**< size of the selection buffer */ + GLuint BufferCount; /**< number of values in the selection buffer */ + GLuint Hits; /**< number of records in the selection buffer */ + GLuint NameStackDepth; /**< name stack depth */ + GLuint NameStack[MAX_NAME_STACK_DEPTH]; /**< name stack */ + GLboolean HitFlag; /**< hit flag */ + GLfloat HitMinZ; /**< minimum hit depth */ + GLfloat HitMaxZ; /**< maximum hit depth */ +}; + + +/** + * 1-D Evaluator control points + */ +struct gl_1d_map +{ + GLuint Order; /**< Number of control points */ + GLfloat u1, u2, du; /**< u1, u2, 1.0/(u2-u1) */ + GLfloat *Points; /**< Points to contiguous control points */ +}; + + 
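gl_array_object::_MaxElement above is documented as the minimum of the enabled arrays' per-array _MaxElement values, which bounds the largest index glDrawElements may legally reference. A sketch of that reduction over two of the conventional arrays; the remaining arrays would be handled the same way, and the helper is illustrative only.

static GLuint
example_compute_max_element(const struct gl_array_object *arrayObj)
{
   GLuint max = ~0u;

   if (arrayObj->Vertex.Enabled && arrayObj->Vertex._MaxElement < max)
      max = arrayObj->Vertex._MaxElement;
   if (arrayObj->Normal.Enabled && arrayObj->Normal._MaxElement < max)
      max = arrayObj->Normal._MaxElement;
   /* ... likewise for Color, TexCoord[i], VertexAttrib[i], etc. */

   return max;
}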
+/** + * 2-D Evaluator control points + */ +struct gl_2d_map +{ + GLuint Uorder; /**< Number of control points in U dimension */ + GLuint Vorder; /**< Number of control points in V dimension */ + GLfloat u1, u2, du; + GLfloat v1, v2, dv; + GLfloat *Points; /**< Points to contiguous control points */ +}; + + +/** + * All evaluator control point state + */ +struct gl_evaluators +{ + /** + * \name 1-D maps + */ + /*@{*/ + struct gl_1d_map Map1Vertex3; + struct gl_1d_map Map1Vertex4; + struct gl_1d_map Map1Index; + struct gl_1d_map Map1Color4; + struct gl_1d_map Map1Normal; + struct gl_1d_map Map1Texture1; + struct gl_1d_map Map1Texture2; + struct gl_1d_map Map1Texture3; + struct gl_1d_map Map1Texture4; + struct gl_1d_map Map1Attrib[16]; /**< GL_NV_vertex_program */ + /*@}*/ + + /** + * \name 2-D maps + */ + /*@{*/ + struct gl_2d_map Map2Vertex3; + struct gl_2d_map Map2Vertex4; + struct gl_2d_map Map2Index; + struct gl_2d_map Map2Color4; + struct gl_2d_map Map2Normal; + struct gl_2d_map Map2Texture1; + struct gl_2d_map Map2Texture2; + struct gl_2d_map Map2Texture3; + struct gl_2d_map Map2Texture4; + struct gl_2d_map Map2Attrib[16]; /**< GL_NV_vertex_program */ + /*@}*/ +}; + + +/** + * Names of the various vertex/fragment program register files, etc. + * + * NOTE: first four tokens must fit into 2 bits (see t_vb_arbprogram.c) + * All values should fit in a 4-bit field. + * + * NOTE: PROGRAM_ENV_PARAM, PROGRAM_STATE_VAR, PROGRAM_NAMED_PARAM, + * PROGRAM_CONSTANT, and PROGRAM_UNIFORM can all be considered to + * be "uniform" variables since they can only be set outside glBegin/End. + * They're also all stored in the same Parameters array. + */ +typedef enum +{ + PROGRAM_TEMPORARY, /**< machine->Temporary[] */ + PROGRAM_INPUT, /**< machine->Inputs[] */ + PROGRAM_OUTPUT, /**< machine->Outputs[] */ + PROGRAM_VARYING, /**< machine->Inputs[]/Outputs[] */ + PROGRAM_LOCAL_PARAM, /**< gl_program->LocalParams[] */ + PROGRAM_ENV_PARAM, /**< gl_program->Parameters[] */ + PROGRAM_STATE_VAR, /**< gl_program->Parameters[] */ + PROGRAM_NAMED_PARAM, /**< gl_program->Parameters[] */ + PROGRAM_CONSTANT, /**< gl_program->Parameters[] */ + PROGRAM_UNIFORM, /**< gl_program->Parameters[] */ + PROGRAM_WRITE_ONLY, /**< A dummy, write-only register */ + PROGRAM_ADDRESS, /**< machine->AddressReg */ + PROGRAM_SAMPLER, /**< for shader samplers, compile-time only */ + PROGRAM_SYSTEM_VALUE,/**< InstanceId, PrimitiveID, etc. */ + PROGRAM_UNDEFINED, /**< Invalid/TBD value */ + PROGRAM_FILE_MAX +} gl_register_file; + + +/** + * If the register file is PROGRAM_SYSTEM_VALUE, the register index will be + * one of these values. 
+ */ +typedef enum +{ + SYSTEM_VALUE_FRONT_FACE, /**< Fragment shader only (not done yet) */ + SYSTEM_VALUE_INSTANCE_ID, /**< Vertex shader only */ + SYSTEM_VALUE_MAX /**< Number of values */ +} gl_system_value; + + +/** Vertex and fragment instructions */ +struct prog_instruction; +struct gl_program_parameter_list; +struct gl_uniform_list; + + +/** + * Base class for any kind of program object + */ +struct gl_program +{ + GLuint Id; + GLubyte *String; /**< Null-terminated program text */ + GLint RefCount; + GLenum Target; /**< GL_VERTEX/FRAGMENT_PROGRAM_ARB, GL_FRAGMENT_PROGRAM_NV */ + GLenum Format; /**< String encoding format */ + GLboolean Resident; + + struct prog_instruction *Instructions; + + GLbitfield InputsRead; /**< Bitmask of which input regs are read */ + GLbitfield64 OutputsWritten; /**< Bitmask of which output regs are written */ + GLbitfield SystemValuesRead; /**< Bitmask of SYSTEM_VALUE_x inputs used */ + GLbitfield InputFlags[MAX_PROGRAM_INPUTS]; /**< PROG_PARAM_BIT_x flags */ + GLbitfield OutputFlags[MAX_PROGRAM_OUTPUTS]; /**< PROG_PARAM_BIT_x flags */ + GLbitfield TexturesUsed[MAX_TEXTURE_UNITS]; /**< TEXTURE_x_BIT bitmask */ + GLbitfield SamplersUsed; /**< Bitfield of which samplers are used */ + GLbitfield ShadowSamplers; /**< Texture units used for shadow sampling. */ + + + /** Named parameters, constants, etc. from program text */ + struct gl_program_parameter_list *Parameters; + /** Numbered local parameters */ + GLfloat LocalParams[MAX_PROGRAM_LOCAL_PARAMS][4]; + + /** Vertex/fragment shader varying vars */ + struct gl_program_parameter_list *Varying; + /** Vertex program user-defined attributes */ + struct gl_program_parameter_list *Attributes; + + /** Map from sampler unit to texture unit (set by glUniform1i()) */ + GLubyte SamplerUnits[MAX_SAMPLERS]; + /** Which texture target is being sampled (TEXTURE_1D/2D/3D/etc_INDEX) */ + gl_texture_index SamplerTargets[MAX_SAMPLERS]; + + /** Bitmask of which register files are read/written with indirect + * addressing. Mask of (1 << PROGRAM_x) bits. + */ + GLbitfield IndirectRegisterFiles; + + /** Logical counts */ + /*@{*/ + GLuint NumInstructions; + GLuint NumTemporaries; + GLuint NumParameters; + GLuint NumAttributes; + GLuint NumAddressRegs; + GLuint NumAluInstructions; + GLuint NumTexInstructions; + GLuint NumTexIndirections; + /*@}*/ + /** Native, actual h/w counts */ + /*@{*/ + GLuint NumNativeInstructions; + GLuint NumNativeTemporaries; + GLuint NumNativeParameters; + GLuint NumNativeAttributes; + GLuint NumNativeAddressRegs; + GLuint NumNativeAluInstructions; + GLuint NumNativeTexInstructions; + GLuint NumNativeTexIndirections; + /*@}*/ +}; + + +/** Vertex program object */ +struct gl_vertex_program +{ + struct gl_program Base; /**< base class */ + GLboolean IsNVProgram; /**< is this a GL_NV_vertex_program program? 
*/ + GLboolean IsPositionInvariant; +}; + + +/** Geometry program object */ +struct gl_geometry_program +{ + struct gl_program Base; /**< base class */ + + GLint VerticesOut; + GLenum InputType; /**< GL_POINTS, GL_LINES, GL_LINES_ADJACENCY_ARB, + GL_TRIANGLES, or GL_TRIANGLES_ADJACENCY_ARB */ + GLenum OutputType; /**< GL_POINTS, GL_LINE_STRIP or GL_TRIANGLE_STRIP */ +}; + + +/** Fragment program object */ +struct gl_fragment_program +{ + struct gl_program Base; /**< base class */ + GLenum FogOption; + GLboolean UsesKill; /**< shader uses KIL instruction */ + GLboolean OriginUpperLeft; + GLboolean PixelCenterInteger; + enum gl_frag_depth_layout FragDepthLayout; +}; + + +/** + * State common to vertex and fragment programs. + */ +struct gl_program_state +{ + GLint ErrorPos; /* GL_PROGRAM_ERROR_POSITION_ARB/NV */ + const char *ErrorString; /* GL_PROGRAM_ERROR_STRING_ARB/NV */ +}; + + +/** + * Context state for vertex programs. + */ +struct gl_vertex_program_state +{ + GLboolean Enabled; /**< User-set GL_VERTEX_PROGRAM_ARB/NV flag */ + GLboolean _Enabled; /**< Enabled and _valid_ user program? */ + GLboolean PointSizeEnabled; /**< GL_VERTEX_PROGRAM_POINT_SIZE_ARB/NV */ + GLboolean TwoSideEnabled; /**< GL_VERTEX_PROGRAM_TWO_SIDE_ARB/NV */ + struct gl_vertex_program *Current; /**< User-bound vertex program */ + + /** Currently enabled and valid vertex program (including internal + * programs, user-defined vertex programs and GLSL vertex shaders). + * This is the program we must use when rendering. + */ + struct gl_vertex_program *_Current; + + GLfloat Parameters[MAX_PROGRAM_ENV_PARAMS][4]; /**< Env params */ + + /* For GL_NV_vertex_program only: */ + GLenum TrackMatrix[MAX_PROGRAM_ENV_PARAMS / 4]; + GLenum TrackMatrixTransform[MAX_PROGRAM_ENV_PARAMS / 4]; + + /** Should fixed-function T&L be implemented with a vertex prog? */ + GLboolean _MaintainTnlProgram; + + /** Program to emulate fixed-function T&L (see above) */ + struct gl_vertex_program *_TnlProgram; + + /** Cache of fixed-function programs */ + struct gl_program_cache *Cache; + + GLboolean _Overriden; +}; + + +/** + * Context state for geometry programs. + */ +struct gl_geometry_program_state +{ + GLboolean Enabled; /**< GL_ARB_GEOMETRY_SHADER4 */ + GLboolean _Enabled; /**< Enabled and valid program? */ + struct gl_geometry_program *Current; /**< user-bound geometry program */ + + /** Currently enabled and valid program (including internal programs + * and compiled shader programs). + */ + struct gl_geometry_program *_Current; + + GLfloat Parameters[MAX_PROGRAM_ENV_PARAMS][4]; /**< Env params */ + + /** Cache of fixed-function programs */ + struct gl_program_cache *Cache; +}; + +/** + * Context state for fragment programs. + */ +struct gl_fragment_program_state +{ + GLboolean Enabled; /**< User-set fragment program enable flag */ + GLboolean _Enabled; /**< Enabled and _valid_ user program? */ + struct gl_fragment_program *Current; /**< User-bound fragment program */ + + /** Currently enabled and valid fragment program (including internal + * programs, user-defined fragment programs and GLSL fragment shaders). + * This is the program we must use when rendering. + */ + struct gl_fragment_program *_Current; + + GLfloat Parameters[MAX_PROGRAM_ENV_PARAMS][4]; /**< Env params */ + + /** Should fixed-function texturing be implemented with a fragment prog? 
*/ + GLboolean _MaintainTexEnvProgram; + + /** Program to emulate fixed-function texture env/combine (see above) */ + struct gl_fragment_program *_TexEnvProgram; + + /** Cache of fixed-function programs */ + struct gl_program_cache *Cache; +}; + + +/** + * ATI_fragment_shader runtime state + */ +#define ATI_FS_INPUT_PRIMARY 0 +#define ATI_FS_INPUT_SECONDARY 1 + +struct atifs_instruction; +struct atifs_setupinst; + +/** + * ATI fragment shader + */ +struct ati_fragment_shader +{ + GLuint Id; + GLint RefCount; + struct atifs_instruction *Instructions[2]; + struct atifs_setupinst *SetupInst[2]; + GLfloat Constants[8][4]; + GLbitfield LocalConstDef; /**< Indicates which constants have been set */ + GLubyte numArithInstr[2]; + GLubyte regsAssigned[2]; + GLubyte NumPasses; /**< 1 or 2 */ + GLubyte cur_pass; + GLubyte last_optype; + GLboolean interpinp1; + GLboolean isValid; + GLuint swizzlerq; +}; + +/** + * Context state for GL_ATI_fragment_shader + */ +struct gl_ati_fragment_shader_state +{ + GLboolean Enabled; + GLboolean _Enabled; /**< enabled and valid shader? */ + GLboolean Compiling; + GLfloat GlobalConstants[8][4]; + struct ati_fragment_shader *Current; +}; + + +/** + * Occlusion/timer query object. + */ +struct gl_query_object +{ + GLenum Target; /**< The query target, when active */ + GLuint Id; /**< hash table ID/name */ + GLuint64EXT Result; /**< the counter */ + GLboolean Active; /**< inside Begin/EndQuery */ + GLboolean Ready; /**< result is ready? */ +}; + + +/** + * Context state for query objects. + */ +struct gl_query_state +{ + struct _mesa_HashTable *QueryObjects; + struct gl_query_object *CurrentOcclusionObject; /* GL_ARB_occlusion_query */ + struct gl_query_object *CurrentTimerObject; /* GL_EXT_timer_query */ + + /** GL_NV_conditional_render */ + struct gl_query_object *CondRenderQuery; + + /** GL_EXT_transform_feedback */ + struct gl_query_object *PrimitivesGenerated; + struct gl_query_object *PrimitivesWritten; + + /** GL_ARB_timer_query */ + struct gl_query_object *TimeElapsed; + + GLenum CondRenderMode; +}; + + +/** Sync object state */ +struct gl_sync_object { + struct simple_node link; + GLenum Type; /**< GL_SYNC_FENCE */ + GLuint Name; /**< Fence name */ + GLint RefCount; /**< Reference count */ + GLboolean DeletePending; /**< Object was deleted while there were still + * live references (e.g., sync not yet finished) + */ + GLenum SyncCondition; + GLbitfield Flags; /**< Flags passed to glFenceSync */ + GLuint StatusFlag:1; /**< Has the sync object been signaled? */ +}; + + +/** Set by #pragma directives */ +struct gl_sl_pragmas +{ + GLboolean IgnoreOptimize; /**< ignore #pragma optimize(on/off) ? */ + GLboolean IgnoreDebug; /**< ignore #pragma debug(on/off) ? */ + GLboolean Optimize; /**< defaults on */ + GLboolean Debug; /**< defaults off */ +}; + + +/** + * A GLSL vertex or fragment shader object. + */ +struct gl_shader +{ + GLenum Type; /**< GL_FRAGMENT_SHADER || GL_VERTEX_SHADER || GL_GEOMETRY_SHADER_ARB (first field!) 
*/ + GLuint Name; /**< AKA the handle */ + GLint RefCount; /**< Reference count */ + GLboolean DeletePending; + GLboolean CompileStatus; + const GLchar *Source; /**< Source code string */ + GLuint SourceChecksum; /**< for debug/logging purposes */ + struct gl_program *Program; /**< Post-compile assembly code */ + GLchar *InfoLog; + struct gl_sl_pragmas Pragmas; + + unsigned Version; /**< GLSL version used for linking */ + + struct exec_list *ir; + struct glsl_symbol_table *symbols; + + /** Shaders containing built-in functions that are used for linking. */ + struct gl_shader *builtins_to_link[16]; + unsigned num_builtins_to_link; +}; + + +/** + * A GLSL program object. + * Basically a linked collection of vertex and fragment shaders. + */ +struct gl_shader_program +{ + GLenum Type; /**< Always GL_SHADER_PROGRAM (internal token) */ + GLuint Name; /**< aka handle or ID */ + GLint RefCount; /**< Reference count */ + GLboolean DeletePending; + + GLuint NumShaders; /**< number of attached shaders */ + struct gl_shader **Shaders; /**< List of attached the shaders */ + + /** User-defined attribute bindings (glBindAttribLocation) */ + struct gl_program_parameter_list *Attributes; + + /** Transform feedback varyings */ + struct { + GLenum BufferMode; + GLuint NumVarying; + GLchar **VaryingNames; /**< Array [NumVarying] of char * */ + } TransformFeedback; + + /** Geometry shader state - copied into gl_geometry_program at link time */ + struct { + GLint VerticesOut; + GLenum InputType; /**< GL_POINTS, GL_LINES, GL_LINES_ADJACENCY_ARB, + GL_TRIANGLES, or GL_TRIANGLES_ADJACENCY_ARB */ + GLenum OutputType; /**< GL_POINTS, GL_LINE_STRIP or GL_TRIANGLE_STRIP */ + } Geom; + + /* post-link info: */ + struct gl_vertex_program *VertexProgram; /**< Linked vertex program */ + struct gl_fragment_program *FragmentProgram; /**< Linked fragment prog */ + struct gl_geometry_program *GeometryProgram; /**< Linked geometry prog */ + struct gl_uniform_list *Uniforms; + struct gl_program_parameter_list *Varying; + GLboolean LinkStatus; /**< GL_LINK_STATUS */ + GLboolean Validated; + GLboolean _Used; /**< Ever used for drawing? */ + GLchar *InfoLog; + + unsigned Version; /**< GLSL version used for linking */ + + /** + * Per-stage shaders resulting from the first stage of linking. + * + * Set of linked shaders for this program. The array is accessed using the + * \c MESA_SHADER_* defines. Entries for non-existent stages will be + * \c NULL. + */ + struct gl_shader *_LinkedShaders[MESA_SHADER_TYPES]; +}; + + +#define GLSL_DUMP 0x1 /**< Dump shaders to stdout */ +#define GLSL_LOG 0x2 /**< Write shaders to files */ +#define GLSL_OPT 0x4 /**< Force optimizations (override pragmas) */ +#define GLSL_NO_OPT 0x8 /**< Force no optimizations (override pragmas) */ +#define GLSL_UNIFORMS 0x10 /**< Print glUniform calls */ +#define GLSL_NOP_VERT 0x20 /**< Force no-op vertex shaders */ +#define GLSL_NOP_FRAG 0x40 /**< Force no-op fragment shaders */ +#define GLSL_USE_PROG 0x80 /**< Log glUseProgram calls */ + + +/** + * Context state for GLSL vertex/fragment shaders. + */ +struct gl_shader_state +{ + /** + * Programs used for rendering + * + * There is a separate program set for each shader stage. If + * GL_EXT_separate_shader_objects is not supported, each of these must point + * to \c NULL or to the same program. + */ + struct gl_shader_program *CurrentVertexProgram; + struct gl_shader_program *CurrentGeometryProgram; + struct gl_shader_program *CurrentFragmentProgram; + + /** + * Program used by glUniform calls. 
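The GLSL_* flags above are debug switches kept in the Flags bitfield of gl_shader_state (just below). A sketch of how two of them might be honored at compile time; illustrative only, and it assumes <stdio.h>.

static void
example_debug_shader(GLbitfield flags, const struct gl_shader *sh)
{
   if (flags & GLSL_DUMP)
      printf("GLSL shader %u source:\n%s\n", sh->Name, sh->Source);

   if (flags & GLSL_LOG) {
      /* write sh->Source to a log file, e.g. named after sh->SourceChecksum */
   }
}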
+ * + * Explicitly set by \c glUseProgram and \c glActiveProgramEXT. + */ + struct gl_shader_program *ActiveProgram; + + void *MemPool; + + GLbitfield Flags; /**< Mask of GLSL_x flags */ +}; + +/** + * Compiler options for a single GLSL shaders type + */ +struct gl_shader_compiler_options +{ + /** Driver-selectable options: */ + GLboolean EmitCondCodes; /**< Use condition codes? */ + GLboolean EmitNVTempInitialization; /**< 0-fill NV temp registers */ + /** + * Attempts to flatten all ir_if (OPCODE_IF) for GPUs that can't + * support control flow. + */ + GLboolean EmitNoIfs; + GLboolean EmitNoLoops; + GLboolean EmitNoFunctions; + GLboolean EmitNoCont; /**< Emit CONT opcode? */ + GLboolean EmitNoMainReturn; /**< Emit CONT/RET opcodes? */ + GLboolean EmitNoNoise; /**< Emit NOISE opcodes? */ + GLboolean EmitNoPow; /**< Emit POW opcodes? */ + + /** + * \name Forms of indirect addressing the driver cannot do. + */ + /*@{*/ + GLboolean EmitNoIndirectInput; /**< No indirect addressing of inputs */ + GLboolean EmitNoIndirectOutput; /**< No indirect addressing of outputs */ + GLboolean EmitNoIndirectTemp; /**< No indirect addressing of temps */ + GLboolean EmitNoIndirectUniform; /**< No indirect addressing of constants */ + /*@}*/ + + GLuint MaxUnrollIterations; + + struct gl_sl_pragmas DefaultPragmas; /**< Default #pragma settings */ +}; + +/** + * Transform feedback object state + */ +struct gl_transform_feedback_object +{ + GLuint Name; /**< AKA the object ID */ + GLint RefCount; + GLboolean Active; /**< Is transform feedback enabled? */ + GLboolean Paused; /**< Is transform feedback paused? */ + + /** The feedback buffers */ + GLuint BufferNames[MAX_FEEDBACK_ATTRIBS]; + struct gl_buffer_object *Buffers[MAX_FEEDBACK_ATTRIBS]; + + /** Start of feedback data in dest buffer */ + GLintptr Offset[MAX_FEEDBACK_ATTRIBS]; + /** Max data to put into dest buffer (in bytes) */ + GLsizeiptr Size[MAX_FEEDBACK_ATTRIBS]; +}; + + +/** + * Context state for transform feedback. + */ +struct gl_transform_feedback +{ + GLenum Mode; /**< GL_POINTS, GL_LINES or GL_TRIANGLES */ + + GLboolean RasterDiscard; /**< GL_RASTERIZER_DISCARD */ + + /** The general binding point (GL_TRANSFORM_FEEDBACK_BUFFER) */ + struct gl_buffer_object *CurrentBuffer; + + /** The table of all transform feedback objects */ + struct _mesa_HashTable *Objects; + + /** The current xform-fb object (GL_TRANSFORM_FEEDBACK_BINDING) */ + struct gl_transform_feedback_object *CurrentObject; + + /** The default xform-fb object (Name==0) */ + struct gl_transform_feedback_object *DefaultObject; +}; + + + +/** + * State which can be shared by multiple contexts: + */ +struct gl_shared_state +{ + _glthread_Mutex Mutex; /**< for thread safety */ + GLint RefCount; /**< Reference count */ + struct _mesa_HashTable *DisplayList; /**< Display lists hash table */ + struct _mesa_HashTable *TexObjects; /**< Texture objects hash table */ + + /** Default texture objects (shared by all texture units) */ + struct gl_texture_object *DefaultTex[NUM_TEXTURE_TARGETS]; + + /** Fallback texture used when a bound texture is incomplete */ + struct gl_texture_object *FallbackTex; + + /** + * \name Thread safety and statechange notification for texture + * objects. + * + * \todo Improve the granularity of locking. 
+ */ + /*@{*/ + _glthread_Mutex TexMutex; /**< texobj thread safety */ + GLuint TextureStateStamp; /**< state notification for shared tex */ + /*@}*/ + + /** Default buffer object for vertex arrays that aren't in VBOs */ + struct gl_buffer_object *NullBufferObj; + + /** + * \name Vertex/geometry/fragment programs + */ + /*@{*/ + struct _mesa_HashTable *Programs; /**< All vertex/fragment programs */ + struct gl_vertex_program *DefaultVertexProgram; + struct gl_fragment_program *DefaultFragmentProgram; + struct gl_geometry_program *DefaultGeometryProgram; + /*@}*/ + + /* GL_ATI_fragment_shader */ + struct _mesa_HashTable *ATIShaders; + struct ati_fragment_shader *DefaultFragmentShader; + + struct _mesa_HashTable *BufferObjects; + + /** Table of both gl_shader and gl_shader_program objects */ + struct _mesa_HashTable *ShaderObjects; + + /* GL_EXT_framebuffer_object */ + struct _mesa_HashTable *RenderBuffers; + struct _mesa_HashTable *FrameBuffers; + + /* GL_ARB_sync */ + struct simple_node SyncObjects; + + void *DriverData; /**< Device driver shared state */ +}; + + + + +/** + * A renderbuffer stores colors or depth values or stencil values. + * A framebuffer object will have a collection of these. + * Data are read/written to the buffer with a handful of Get/Put functions. + * + * Instances of this object are allocated with the Driver's NewRenderbuffer + * hook. Drivers will likely wrap this class inside a driver-specific + * class to simulate inheritance. + */ +struct gl_renderbuffer +{ +#define RB_MAGIC 0xaabbccdd + int Magic; /** XXX TEMPORARY DEBUG INFO */ + _glthread_Mutex Mutex; /**< for thread safety */ + GLuint ClassID; /**< Useful for drivers */ + GLuint Name; + GLint RefCount; + GLuint Width, Height; + GLboolean Purgeable; /**< Is the buffer purgeable under memory pressure? */ + + GLenum InternalFormat; /**< The user-specified format */ + GLenum _BaseFormat; /**< Either GL_RGB, GL_RGBA, GL_DEPTH_COMPONENT or + GL_STENCIL_INDEX. */ + gl_format Format; /**< The actual renderbuffer memory format */ + + GLubyte NumSamples; + + GLenum DataType; /**< Type of values passed to the Get/Put functions */ + GLvoid *Data; /**< This may not be used by some kinds of RBs */ + + /* Used to wrap one renderbuffer around another: */ + struct gl_renderbuffer *Wrapped; + + /* Delete this renderbuffer */ + void (*Delete)(struct gl_renderbuffer *rb); + + /* Allocate new storage for this renderbuffer */ + GLboolean (*AllocStorage)(struct gl_context *ctx, struct gl_renderbuffer *rb, + GLenum internalFormat, + GLuint width, GLuint height); + + /* Lock/Unlock are called before/after calling the Get/Put functions. + * Not sure this is the right place for these yet. + void (*Lock)(struct gl_context *ctx, struct gl_renderbuffer *rb); + void (*Unlock)(struct gl_context *ctx, struct gl_renderbuffer *rb); + */ + + /* Return a pointer to the element/pixel at (x,y). + * Should return NULL if the buffer memory can't be directly addressed. + */ + void *(*GetPointer)(struct gl_context *ctx, struct gl_renderbuffer *rb, + GLint x, GLint y); + + /* Get/Read a row of values. + * The values will be of format _BaseFormat and type DataType. + */ + void (*GetRow)(struct gl_context *ctx, struct gl_renderbuffer *rb, GLuint count, + GLint x, GLint y, void *values); + + /* Get/Read values at arbitrary locations. + * The values will be of format _BaseFormat and type DataType. 
+ */ + void (*GetValues)(struct gl_context *ctx, struct gl_renderbuffer *rb, GLuint count, + const GLint x[], const GLint y[], void *values); + + /* Put/Write a row of values. + * The values will be of format _BaseFormat and type DataType. + */ + void (*PutRow)(struct gl_context *ctx, struct gl_renderbuffer *rb, GLuint count, + GLint x, GLint y, const void *values, const GLubyte *mask); + + /* Put/Write a row of RGB values. This is a special-case routine that's + * only used for RGBA renderbuffers when the source data is GL_RGB. That's + * a common case for glDrawPixels and some triangle routines. + * The values will be of format GL_RGB and type DataType. + */ + void (*PutRowRGB)(struct gl_context *ctx, struct gl_renderbuffer *rb, GLuint count, + GLint x, GLint y, const void *values, const GLubyte *mask); + + + /* Put/Write a row of identical values. + * The values will be of format _BaseFormat and type DataType. + */ + void (*PutMonoRow)(struct gl_context *ctx, struct gl_renderbuffer *rb, GLuint count, + GLint x, GLint y, const void *value, const GLubyte *mask); + + /* Put/Write values at arbitrary locations. + * The values will be of format _BaseFormat and type DataType. + */ + void (*PutValues)(struct gl_context *ctx, struct gl_renderbuffer *rb, GLuint count, + const GLint x[], const GLint y[], const void *values, + const GLubyte *mask); + /* Put/Write identical values at arbitrary locations. + * The values will be of format _BaseFormat and type DataType. + */ + void (*PutMonoValues)(struct gl_context *ctx, struct gl_renderbuffer *rb, + GLuint count, const GLint x[], const GLint y[], + const void *value, const GLubyte *mask); +}; + + +/** + * A renderbuffer attachment points to either a texture object (and specifies + * a mipmap level, cube face or 3D texture slice) or points to a renderbuffer. + */ +struct gl_renderbuffer_attachment +{ + GLenum Type; /**< \c GL_NONE or \c GL_TEXTURE or \c GL_RENDERBUFFER_EXT */ + GLboolean Complete; + + /** + * If \c Type is \c GL_RENDERBUFFER_EXT, this stores a pointer to the + * application supplied renderbuffer object. + */ + struct gl_renderbuffer *Renderbuffer; + + /** + * If \c Type is \c GL_TEXTURE, this stores a pointer to the application + * supplied texture object. + */ + struct gl_texture_object *Texture; + GLuint TextureLevel; /**< Attached mipmap level. */ + GLuint CubeMapFace; /**< 0 .. 5, for cube map textures. */ + GLuint Zoffset; /**< Slice for 3D textures, or layer for both 1D + * and 2D array textures */ +}; + + +/** + * A framebuffer is a collection of renderbuffers (color, depth, stencil, etc). + * In C++ terms, think of this as a base class from which device drivers + * will make derived classes. + */ +struct gl_framebuffer +{ + _glthread_Mutex Mutex; /**< for thread safety */ + /** + * If zero, this is a window system framebuffer. If non-zero, this + * is a FBO framebuffer; note that for some devices (i.e. those with + * a natural pixel coordinate system for FBOs that differs from the + * OpenGL/Mesa coordinate system), this means that the viewport, + * polygon face orientation, and polygon stipple will have to be inverted. + */ + GLuint Name; + + GLint RefCount; + GLboolean DeletePending; + + /** + * The framebuffer's visual. Immutable if this is a window system buffer. + * Computed from attachments if user-made FBO. 
+ */ + struct gl_config Visual; + + GLboolean Initialized; + + GLuint Width, Height; /**< size of frame buffer in pixels */ + + /** \name Drawing bounds (Intersection of buffer size and scissor box) */ + /*@{*/ + GLint _Xmin, _Xmax; /**< inclusive */ + GLint _Ymin, _Ymax; /**< exclusive */ + /*@}*/ + + /** \name Derived Z buffer stuff */ + /*@{*/ + GLuint _DepthMax; /**< Max depth buffer value */ + GLfloat _DepthMaxF; /**< Float max depth buffer value */ + GLfloat _MRD; /**< minimum resolvable difference in Z values */ + /*@}*/ + + /** One of the GL_FRAMEBUFFER_(IN)COMPLETE_* tokens */ + GLenum _Status; + + /** Integer color values */ + GLboolean _IntegerColor; + + /** Array of all renderbuffer attachments, indexed by BUFFER_* tokens. */ + struct gl_renderbuffer_attachment Attachment[BUFFER_COUNT]; + + /* In unextended OpenGL these vars are part of the GL_COLOR_BUFFER + * attribute group and GL_PIXEL attribute group, respectively. + */ + GLenum ColorDrawBuffer[MAX_DRAW_BUFFERS]; + GLenum ColorReadBuffer; + + /** Computed from ColorDraw/ReadBuffer above */ + GLuint _NumColorDrawBuffers; + GLint _ColorDrawBufferIndexes[MAX_DRAW_BUFFERS]; /**< BUFFER_x or -1 */ + GLint _ColorReadBufferIndex; /* -1 = None */ + struct gl_renderbuffer *_ColorDrawBuffers[MAX_DRAW_BUFFERS]; + struct gl_renderbuffer *_ColorReadBuffer; + + /** The Actual depth/stencil buffers to use. May be wrappers around the + * depth/stencil buffers attached above. */ + struct gl_renderbuffer *_DepthBuffer; + struct gl_renderbuffer *_StencilBuffer; + + /** Delete this framebuffer */ + void (*Delete)(struct gl_framebuffer *fb); +}; + + +/** + * Precision info for shader datatypes. See glGetShaderPrecisionFormat(). + */ +struct gl_precision +{ + GLushort RangeMin; /**< min value exponent */ + GLushort RangeMax; /**< max value exponent */ + GLushort Precision; /**< number of mantissa bits */ +}; + + +/** + * Limits for vertex and fragment programs/shaders. + */ +struct gl_program_constants +{ + /* logical limits */ + GLuint MaxInstructions; + GLuint MaxAluInstructions; + GLuint MaxTexInstructions; + GLuint MaxTexIndirections; + GLuint MaxAttribs; + GLuint MaxTemps; + GLuint MaxAddressRegs; + GLuint MaxParameters; + GLuint MaxLocalParams; + GLuint MaxEnvParams; + /* native/hardware limits */ + GLuint MaxNativeInstructions; + GLuint MaxNativeAluInstructions; + GLuint MaxNativeTexInstructions; + GLuint MaxNativeTexIndirections; + GLuint MaxNativeAttribs; + GLuint MaxNativeTemps; + GLuint MaxNativeAddressRegs; + GLuint MaxNativeParameters; + /* For shaders */ + GLuint MaxUniformComponents; + /* GL_ARB_geometry_shader4 */ + GLuint MaxGeometryTextureImageUnits; + GLuint MaxGeometryVaryingComponents; + GLuint MaxVertexVaryingComponents; + GLuint MaxGeometryUniformComponents; + GLuint MaxGeometryOutputVertices; + GLuint MaxGeometryTotalOutputComponents; + /* ES 2.0 and GL_ARB_ES2_compatibility */ + struct gl_precision LowFloat, MediumFloat, HighFloat; + struct gl_precision LowInt, MediumInt, HighInt; +}; + + +/** + * Constants which may be overridden by device driver during context creation + * but are never changed after that. + */ +struct gl_constants +{ + GLint MaxTextureMbytes; /**< Max memory per image, in MB */ + GLint MaxTextureLevels; /**< Max mipmap levels. 
 */ + GLint Max3DTextureLevels; /**< Max mipmap levels for 3D textures */ + GLint MaxCubeTextureLevels; /**< Max mipmap levels for cube textures */ + GLint MaxArrayTextureLayers; /**< Max layers in array textures */ + GLint MaxTextureRectSize; /**< Max rectangle texture size, in pixels */ + GLuint MaxTextureCoordUnits; + GLuint MaxTextureImageUnits; + GLuint MaxVertexTextureImageUnits; + GLuint MaxCombinedTextureImageUnits; + GLuint MaxTextureUnits; /**< = MIN(CoordUnits, ImageUnits) */ + GLfloat MaxTextureMaxAnisotropy; /**< GL_EXT_texture_filter_anisotropic */ + GLfloat MaxTextureLodBias; /**< GL_EXT_texture_lod_bias */ + + GLuint MaxArrayLockSize; + + GLint SubPixelBits; + + GLfloat MinPointSize, MaxPointSize; /**< aliased */ + GLfloat MinPointSizeAA, MaxPointSizeAA; /**< antialiased */ + GLfloat PointSizeGranularity; + GLfloat MinLineWidth, MaxLineWidth; /**< aliased */ + GLfloat MinLineWidthAA, MaxLineWidthAA; /**< antialiased */ + GLfloat LineWidthGranularity; + + GLuint MaxColorTableSize; + + GLuint MaxClipPlanes; + GLuint MaxLights; + GLfloat MaxShininess; /**< GL_NV_light_max_exponent */ + GLfloat MaxSpotExponent; /**< GL_NV_light_max_exponent */ + + GLuint MaxViewportWidth, MaxViewportHeight; + + struct gl_program_constants VertexProgram; /**< GL_ARB_vertex_program */ + struct gl_program_constants FragmentProgram; /**< GL_ARB_fragment_program */ + struct gl_program_constants GeometryProgram; /**< GL_ARB_geometry_shader4 */ + GLuint MaxProgramMatrices; + GLuint MaxProgramMatrixStackDepth; + + /** vertex array / buffer object bounds checking */ + GLboolean CheckArrayBounds; + + GLuint MaxDrawBuffers; /**< GL_ARB_draw_buffers */ + + GLuint MaxColorAttachments; /**< GL_EXT_framebuffer_object */ + GLuint MaxRenderbufferSize; /**< GL_EXT_framebuffer_object */ + GLuint MaxSamples; /**< GL_ARB_framebuffer_object */ + + GLuint MaxVarying; /**< Number of float[4] varying parameters */ + + GLuint GLSLVersion; /**< GLSL version supported (ex: 120 = 1.20) */ + + /** Which texture units support GL_ATI_envmap_bumpmap as targets */ + GLbitfield SupportedBumpUnits; + + /** + * Maximum amount of time, measured in nanoseconds, that the server can wait. + */ + GLuint64 MaxServerWaitTimeout; + + /** GL_EXT_provoking_vertex */ + GLboolean QuadsFollowProvokingVertexConvention; + + /** OpenGL version 3.0 */ + GLbitfield ContextFlags; /**< Ex: GL_CONTEXT_FLAG_FORWARD_COMPATIBLE_BIT */ + + /** OpenGL version 3.2 */ + GLbitfield ProfileMask; /**< Mask of CONTEXT_x_PROFILE_BIT */ + + /** GL_EXT_transform_feedback */ + GLuint MaxTransformFeedbackSeparateAttribs; + GLuint MaxTransformFeedbackSeparateComponents; + GLuint MaxTransformFeedbackInterleavedComponents; + + /** GL_EXT_gpu_shader4 */ + GLint MinProgramTexelOffset, MaxProgramTexelOffset; + + /* GL_EXT_framebuffer_sRGB */ + GLboolean sRGBCapable; /* can enable sRGB blend/update on FBOs */ +}; + + +/** + * Enable flag for each OpenGL extension. Different device drivers will + * enable different extensions at runtime. + */ +struct gl_extensions +{ + GLboolean dummy; /* don't remove this! */ + GLboolean dummy_true; /* Set true by _mesa_init_extensions(). */ + GLboolean dummy_false; /* Set false by _mesa_init_extensions(). 
*/ + GLboolean ARB_ES2_compatibility; + GLboolean ARB_blend_func_extended; + GLboolean ARB_copy_buffer; + GLboolean ARB_depth_buffer_float; + GLboolean ARB_depth_clamp; + GLboolean ARB_depth_texture; + GLboolean ARB_draw_buffers; + GLboolean ARB_draw_buffers_blend; + GLboolean ARB_draw_elements_base_vertex; + GLboolean ARB_draw_instanced; + GLboolean ARB_fragment_coord_conventions; + GLboolean ARB_fragment_program; + GLboolean ARB_fragment_program_shadow; + GLboolean ARB_fragment_shader; + GLboolean ARB_framebuffer_object; + GLboolean ARB_explicit_attrib_location; + GLboolean ARB_geometry_shader4; + GLboolean ARB_half_float_pixel; + GLboolean ARB_half_float_vertex; + GLboolean ARB_instanced_arrays; + GLboolean ARB_map_buffer_range; + GLboolean ARB_multisample; + GLboolean ARB_multitexture; + GLboolean ARB_occlusion_query; + GLboolean ARB_occlusion_query2; + GLboolean ARB_point_sprite; + GLboolean ARB_sampler_objects; + GLboolean ARB_seamless_cube_map; + GLboolean ARB_shader_objects; + GLboolean ARB_shader_stencil_export; + GLboolean ARB_shading_language_100; + GLboolean ARB_shadow; + GLboolean ARB_shadow_ambient; + GLboolean ARB_sync; + GLboolean ARB_texture_border_clamp; + GLboolean ARB_texture_buffer_object; + GLboolean ARB_texture_compression; + GLboolean ARB_texture_compression_rgtc; + GLboolean ARB_texture_cube_map; + GLboolean ARB_texture_env_combine; + GLboolean ARB_texture_env_crossbar; + GLboolean ARB_texture_env_dot3; + GLboolean ARB_texture_float; + GLboolean ARB_texture_mirrored_repeat; + GLboolean ARB_texture_multisample; + GLboolean ARB_texture_non_power_of_two; + GLboolean ARB_texture_rg; + GLboolean ARB_texture_rgb10_a2ui; + GLboolean ARB_timer_query; + GLboolean ARB_transform_feedback2; + GLboolean ARB_transpose_matrix; + GLboolean ARB_uniform_buffer_object; + GLboolean ARB_vertex_array_object; + GLboolean ARB_vertex_buffer_object; + GLboolean ARB_vertex_program; + GLboolean ARB_vertex_shader; + GLboolean ARB_vertex_type_2_10_10_10_rev; + GLboolean ARB_window_pos; + GLboolean EXT_abgr; + GLboolean EXT_bgra; + GLboolean EXT_blend_color; + GLboolean EXT_blend_equation_separate; + GLboolean EXT_blend_func_separate; + GLboolean EXT_blend_logic_op; + GLboolean EXT_blend_minmax; + GLboolean EXT_blend_subtract; + GLboolean EXT_clip_volume_hint; + GLboolean EXT_compiled_vertex_array; + GLboolean EXT_copy_texture; + GLboolean EXT_depth_bounds_test; + GLboolean EXT_draw_buffers2; + GLboolean EXT_draw_range_elements; + GLboolean EXT_fog_coord; + GLboolean EXT_framebuffer_blit; + GLboolean EXT_framebuffer_multisample; + GLboolean EXT_framebuffer_object; + GLboolean EXT_framebuffer_sRGB; + GLboolean EXT_gpu_program_parameters; + GLboolean EXT_gpu_shader4; + GLboolean EXT_multi_draw_arrays; + GLboolean EXT_paletted_texture; + GLboolean EXT_packed_depth_stencil; + GLboolean EXT_packed_float; + GLboolean EXT_packed_pixels; + GLboolean EXT_pixel_buffer_object; + GLboolean EXT_point_parameters; + GLboolean EXT_polygon_offset; + GLboolean EXT_provoking_vertex; + GLboolean EXT_rescale_normal; + GLboolean EXT_shadow_funcs; + GLboolean EXT_secondary_color; + GLboolean EXT_separate_shader_objects; + GLboolean EXT_separate_specular_color; + GLboolean EXT_shared_texture_palette; + GLboolean EXT_stencil_wrap; + GLboolean EXT_stencil_two_side; + GLboolean EXT_subtexture; + GLboolean EXT_texture; + GLboolean EXT_texture_object; + GLboolean EXT_texture3D; + GLboolean EXT_texture_array; + GLboolean EXT_texture_compression_s3tc; + GLboolean EXT_texture_env_add; + GLboolean EXT_texture_env_combine; + 
GLboolean EXT_texture_env_dot3; + GLboolean EXT_texture_filter_anisotropic; + GLboolean EXT_texture_integer; + GLboolean EXT_texture_lod_bias; + GLboolean EXT_texture_mirror_clamp; + GLboolean EXT_texture_shared_exponent; + GLboolean EXT_texture_sRGB; + GLboolean EXT_texture_sRGB_decode; + GLboolean EXT_texture_swizzle; + GLboolean EXT_transform_feedback; + GLboolean EXT_timer_query; + GLboolean EXT_vertex_array; + GLboolean EXT_vertex_array_bgra; + GLboolean EXT_vertex_array_set; + GLboolean OES_standard_derivatives; + /* vendor extensions */ + GLboolean AMD_conservative_depth; + GLboolean APPLE_client_storage; + GLboolean APPLE_packed_pixels; + GLboolean APPLE_vertex_array_object; + GLboolean APPLE_object_purgeable; + GLboolean ATI_envmap_bumpmap; + GLboolean ATI_texture_mirror_once; + GLboolean ATI_texture_env_combine3; + GLboolean ATI_fragment_shader; + GLboolean ATI_separate_stencil; + GLboolean IBM_rasterpos_clip; + GLboolean IBM_multimode_draw_arrays; + GLboolean MESA_pack_invert; + GLboolean MESA_resize_buffers; + GLboolean MESA_ycbcr_texture; + GLboolean MESA_texture_array; + GLboolean MESA_texture_signed_rgba; + GLboolean NV_blend_square; + GLboolean NV_conditional_render; + GLboolean NV_fragment_program; + GLboolean NV_fragment_program_option; + GLboolean NV_light_max_exponent; + GLboolean NV_point_sprite; + GLboolean NV_primitive_restart; + GLboolean NV_texgen_reflection; + GLboolean NV_texture_env_combine4; + GLboolean NV_texture_rectangle; + GLboolean NV_vertex_program; + GLboolean NV_vertex_program1_1; + GLboolean OES_read_format; + GLboolean SGI_texture_color_table; + GLboolean SGIS_generate_mipmap; + GLboolean SGIS_texture_edge_clamp; + GLboolean SGIS_texture_lod; + GLboolean TDFX_texture_compression_FXT1; + GLboolean S3_s3tc; + GLboolean OES_EGL_image; + GLboolean OES_draw_texture; + GLboolean EXT_texture_format_BGRA8888; + GLboolean extension_sentinel; + /** The extension string */ + const GLubyte *String; + /** Number of supported extensions */ + GLuint Count; +}; + + +/** + * A stack of matrices (projection, modelview, color, texture, etc). + */ +struct gl_matrix_stack +{ + GLmatrix *Top; /**< points into Stack */ + GLmatrix *Stack; /**< array [MaxDepth] of GLmatrix */ + GLuint Depth; /**< 0 <= Depth < MaxDepth */ + GLuint MaxDepth; /**< size of Stack[] array */ + GLuint DirtyFlag; /**< _NEW_MODELVIEW or _NEW_PROJECTION, for example */ +}; + + +/** + * \name Bits for image transfer operations + * \sa __struct gl_contextRec::ImageTransferState. + */ +/*@{*/ +#define IMAGE_SCALE_BIAS_BIT 0x1 +#define IMAGE_SHIFT_OFFSET_BIT 0x2 +#define IMAGE_MAP_COLOR_BIT 0x4 +#define IMAGE_CLAMP_BIT 0x800 + + +/** Pixel Transfer ops */ +#define IMAGE_BITS (IMAGE_SCALE_BIAS_BIT | \ + IMAGE_SHIFT_OFFSET_BIT | \ + IMAGE_MAP_COLOR_BIT) + +/** + * \name Bits to indicate what state has changed. 
+ */ +/*@{*/ +#define _NEW_MODELVIEW (1 << 0) /**< gl_context::ModelView */ +#define _NEW_PROJECTION (1 << 1) /**< gl_context::Projection */ +#define _NEW_TEXTURE_MATRIX (1 << 2) /**< gl_context::TextureMatrix */ +#define _NEW_COLOR (1 << 3) /**< gl_context::Color */ +#define _NEW_DEPTH (1 << 4) /**< gl_context::Depth */ +#define _NEW_EVAL (1 << 5) /**< gl_context::Eval, EvalMap */ +#define _NEW_FOG (1 << 6) /**< gl_context::Fog */ +#define _NEW_HINT (1 << 7) /**< gl_context::Hint */ +#define _NEW_LIGHT (1 << 8) /**< gl_context::Light */ +#define _NEW_LINE (1 << 9) /**< gl_context::Line */ +#define _NEW_PIXEL (1 << 10) /**< gl_context::Pixel */ +#define _NEW_POINT (1 << 11) /**< gl_context::Point */ +#define _NEW_POLYGON (1 << 12) /**< gl_context::Polygon */ +#define _NEW_POLYGONSTIPPLE (1 << 13) /**< gl_context::PolygonStipple */ +#define _NEW_SCISSOR (1 << 14) /**< gl_context::Scissor */ +#define _NEW_STENCIL (1 << 15) /**< gl_context::Stencil */ +#define _NEW_TEXTURE (1 << 16) /**< gl_context::Texture */ +#define _NEW_TRANSFORM (1 << 17) /**< gl_context::Transform */ +#define _NEW_VIEWPORT (1 << 18) /**< gl_context::Viewport */ +#define _NEW_PACKUNPACK (1 << 19) /**< gl_context::Pack, Unpack */ +#define _NEW_ARRAY (1 << 20) /**< gl_context::Array */ +#define _NEW_RENDERMODE (1 << 21) /**< gl_context::RenderMode, etc */ +#define _NEW_BUFFERS (1 << 22) /**< gl_context::Visual, DrawBuffer, */ +#define _NEW_CURRENT_ATTRIB (1 << 23) /**< gl_context::Current */ +#define _NEW_MULTISAMPLE (1 << 24) /**< gl_context::Multisample */ +#define _NEW_TRACK_MATRIX (1 << 25) /**< gl_context::VertexProgram */ +#define _NEW_PROGRAM (1 << 26) /**< New program/shader state */ +#define _NEW_PROGRAM_CONSTANTS (1 << 27) +#define _NEW_BUFFER_OBJECT (1 << 28) +#define _NEW_ALL ~0 +/*@}*/ + + +/** + * \name Bits to track array state changes + * + * Also used to summarize array enabled. + */ +/*@{*/ +#define _NEW_ARRAY_VERTEX VERT_BIT_POS +#define _NEW_ARRAY_WEIGHT VERT_BIT_WEIGHT +#define _NEW_ARRAY_NORMAL VERT_BIT_NORMAL +#define _NEW_ARRAY_COLOR0 VERT_BIT_COLOR0 +#define _NEW_ARRAY_COLOR1 VERT_BIT_COLOR1 +#define _NEW_ARRAY_FOGCOORD VERT_BIT_FOG +#define _NEW_ARRAY_INDEX VERT_BIT_COLOR_INDEX +#define _NEW_ARRAY_EDGEFLAG VERT_BIT_EDGEFLAG +#define _NEW_ARRAY_POINT_SIZE VERT_BIT_COLOR_INDEX /* aliased */ +#define _NEW_ARRAY_TEXCOORD_0 VERT_BIT_TEX0 +#define _NEW_ARRAY_TEXCOORD_1 VERT_BIT_TEX1 +#define _NEW_ARRAY_TEXCOORD_2 VERT_BIT_TEX2 +#define _NEW_ARRAY_TEXCOORD_3 VERT_BIT_TEX3 +#define _NEW_ARRAY_TEXCOORD_4 VERT_BIT_TEX4 +#define _NEW_ARRAY_TEXCOORD_5 VERT_BIT_TEX5 +#define _NEW_ARRAY_TEXCOORD_6 VERT_BIT_TEX6 +#define _NEW_ARRAY_TEXCOORD_7 VERT_BIT_TEX7 +#define _NEW_ARRAY_ATTRIB_0 VERT_BIT_GENERIC0 /* start at bit 16 */ +#define _NEW_ARRAY_ALL 0xffffffff + + +#define _NEW_ARRAY_TEXCOORD(i) (_NEW_ARRAY_TEXCOORD_0 << (i)) +#define _NEW_ARRAY_ATTRIB(i) (_NEW_ARRAY_ATTRIB_0 << (i)) +/*@}*/ + + + +/** + * \name A bunch of flags that we think might be useful to drivers. + * + * Set in the __struct gl_contextRec::_TriangleCaps bitfield. 
+ */ +/*@{*/ +#define DD_FLATSHADE 0x1 +#define DD_SEPARATE_SPECULAR 0x2 +#define DD_TRI_CULL_FRONT_BACK 0x4 /* special case on some hw */ +#define DD_TRI_LIGHT_TWOSIDE 0x8 +#define DD_TRI_UNFILLED 0x10 +#define DD_TRI_SMOOTH 0x20 +#define DD_TRI_STIPPLE 0x40 +#define DD_TRI_OFFSET 0x80 +#define DD_LINE_SMOOTH 0x100 +#define DD_LINE_STIPPLE 0x200 +#define DD_POINT_SMOOTH 0x400 +#define DD_POINT_ATTEN 0x800 +#define DD_TRI_TWOSTENCIL 0x1000 +/*@}*/ + + +/** + * \name Define the state changes under which each of these bits might change + */ +/*@{*/ +#define _DD_NEW_FLATSHADE _NEW_LIGHT +#define _DD_NEW_SEPARATE_SPECULAR (_NEW_LIGHT | _NEW_FOG | _NEW_PROGRAM) +#define _DD_NEW_TRI_CULL_FRONT_BACK _NEW_POLYGON +#define _DD_NEW_TRI_LIGHT_TWOSIDE _NEW_LIGHT +#define _DD_NEW_TRI_UNFILLED _NEW_POLYGON +#define _DD_NEW_TRI_SMOOTH _NEW_POLYGON +#define _DD_NEW_TRI_STIPPLE _NEW_POLYGON +#define _DD_NEW_TRI_OFFSET _NEW_POLYGON +#define _DD_NEW_LINE_SMOOTH _NEW_LINE +#define _DD_NEW_LINE_STIPPLE _NEW_LINE +#define _DD_NEW_LINE_WIDTH _NEW_LINE +#define _DD_NEW_POINT_SMOOTH _NEW_POINT +#define _DD_NEW_POINT_SIZE _NEW_POINT +#define _DD_NEW_POINT_ATTEN _NEW_POINT +/*@}*/ + + +/** + * Composite state flags + */ +/*@{*/ +#define _MESA_NEW_NEED_EYE_COORDS (_NEW_LIGHT | \ + _NEW_TEXTURE | \ + _NEW_POINT | \ + _NEW_PROGRAM | \ + _NEW_MODELVIEW) + +#define _MESA_NEW_NEED_NORMALS (_NEW_LIGHT | \ + _NEW_TEXTURE) + +#define _MESA_NEW_TRANSFER_STATE (_NEW_PIXEL) +/*@}*/ + + + + +/* This has to be included here. */ +#include "dd.h" + + +/** + * Display list flags. + * Strictly this is a tnl-private concept, but it doesn't seem + * worthwhile adding a tnl private structure just to hold this one bit + * of information: + */ +#define DLIST_DANGLING_REFS 0x1 + + +/** Opaque declaration of display list payload data type */ +union gl_dlist_node; + + +/** + * Provide a location where information about a display list can be + * collected. Could be extended with driverPrivate structures, + * etc. in the future. + */ +struct gl_display_list +{ + GLuint Name; + GLbitfield Flags; /**< DLIST_x flags */ + /** The dlist commands are in a linked list of nodes */ + union gl_dlist_node *Head; +}; + + +/** + * State used during display list compilation and execution. + */ +struct gl_dlist_state +{ + GLuint CallDepth; /**< Current recursion calling depth */ + + struct gl_display_list *CurrentList; /**< List currently being compiled */ + union gl_dlist_node *CurrentBlock; /**< Pointer to current block of nodes */ + GLuint CurrentPos; /**< Index into current block of nodes */ + + GLvertexformat ListVtxfmt; + + GLubyte ActiveAttribSize[VERT_ATTRIB_MAX]; + GLfloat CurrentAttrib[VERT_ATTRIB_MAX][4]; + + GLubyte ActiveMaterialSize[MAT_ATTRIB_MAX]; + GLfloat CurrentMaterial[MAT_ATTRIB_MAX][4]; + + GLubyte ActiveIndex; + GLfloat CurrentIndex; + + GLubyte ActiveEdgeFlag; + GLboolean CurrentEdgeFlag; + + struct { + /* State known to have been set by the currently-compiling display + * list. Used to eliminate some redundant state changes. + */ + GLenum ShadeModel; + } Current; +}; + + +/** + * Enum for the OpenGL APIs we know about and may support. + */ +typedef enum +{ + API_OPENGL, + API_OPENGLES, + API_OPENGLES2 +} gl_api; + + +/** + * Mesa rendering context. + * + * This is the central context data structure for Mesa. Almost all + * OpenGL state is contained in this structure. + * Think of this as a base class from which device drivers will derive + * sub classes. + * + * The struct gl_context typedef names this structure. 
+ */ +struct gl_context +{ + /** State possibly shared with other contexts in the address space */ + struct gl_shared_state *Shared; + + /** \name API function pointer tables */ + /*@{*/ + gl_api API; + struct _glapi_table *Save; /**< Display list save functions */ + struct _glapi_table *Exec; /**< Execute functions */ + struct _glapi_table *CurrentDispatch; /**< == Save or Exec !! */ + /*@}*/ + + struct gl_config Visual; + struct gl_framebuffer *DrawBuffer; /**< buffer for writing */ + struct gl_framebuffer *ReadBuffer; /**< buffer for reading */ + struct gl_framebuffer *WinSysDrawBuffer; /**< set with MakeCurrent */ + struct gl_framebuffer *WinSysReadBuffer; /**< set with MakeCurrent */ + + /** + * Device driver function pointer table + */ + struct dd_function_table Driver; + + void *DriverCtx; /**< Points to device driver context/state */ + + /** Core/Driver constants */ + struct gl_constants Const; + + /** \name The various 4x4 matrix stacks */ + /*@{*/ + struct gl_matrix_stack ModelviewMatrixStack; + struct gl_matrix_stack ProjectionMatrixStack; + struct gl_matrix_stack TextureMatrixStack[MAX_TEXTURE_UNITS]; + struct gl_matrix_stack ProgramMatrixStack[MAX_PROGRAM_MATRICES]; + struct gl_matrix_stack *CurrentStack; /**< Points to one of the above stacks */ + /*@}*/ + + /** Combined modelview and projection matrix */ + GLmatrix _ModelProjectMatrix; + + /** \name Display lists */ + struct gl_dlist_state ListState; + + GLboolean ExecuteFlag; /**< Execute GL commands? */ + GLboolean CompileFlag; /**< Compile GL commands into display list? */ + + /** Extension information */ + struct gl_extensions Extensions; + + /** Version info */ + GLuint VersionMajor, VersionMinor; + char *VersionString; + + /** \name State attribute stack (for glPush/PopAttrib) */ + /*@{*/ + GLuint AttribStackDepth; + struct gl_attrib_node *AttribStack[MAX_ATTRIB_STACK_DEPTH]; + /*@}*/ + + /** \name Renderer attribute groups + * + * We define a struct for each attribute group to make pushing and popping + * attributes easy. Also it's a good organization. 
+ */ + /*@{*/ + struct gl_accum_attrib Accum; /**< Accum buffer attributes */ + struct gl_colorbuffer_attrib Color; /**< Color buffer attributes */ + struct gl_current_attrib Current; /**< Current attributes */ + struct gl_depthbuffer_attrib Depth; /**< Depth buffer attributes */ + struct gl_eval_attrib Eval; /**< Eval attributes */ + struct gl_fog_attrib Fog; /**< Fog attributes */ + struct gl_hint_attrib Hint; /**< Hint attributes */ + struct gl_light_attrib Light; /**< Light attributes */ + struct gl_line_attrib Line; /**< Line attributes */ + struct gl_list_attrib List; /**< List attributes */ + struct gl_multisample_attrib Multisample; + struct gl_pixel_attrib Pixel; /**< Pixel attributes */ + struct gl_point_attrib Point; /**< Point attributes */ + struct gl_polygon_attrib Polygon; /**< Polygon attributes */ + GLuint PolygonStipple[32]; /**< Polygon stipple */ + struct gl_scissor_attrib Scissor; /**< Scissor attributes */ + struct gl_stencil_attrib Stencil; /**< Stencil buffer attributes */ + struct gl_texture_attrib Texture; /**< Texture attributes */ + struct gl_transform_attrib Transform; /**< Transformation attributes */ + struct gl_viewport_attrib Viewport; /**< Viewport attributes */ + /*@}*/ + + /** \name Client attribute stack */ + /*@{*/ + GLuint ClientAttribStackDepth; + struct gl_attrib_node *ClientAttribStack[MAX_CLIENT_ATTRIB_STACK_DEPTH]; + /*@}*/ + + /** \name Client attribute groups */ + /*@{*/ + struct gl_array_attrib Array; /**< Vertex arrays */ + struct gl_pixelstore_attrib Pack; /**< Pixel packing */ + struct gl_pixelstore_attrib Unpack; /**< Pixel unpacking */ + struct gl_pixelstore_attrib DefaultPacking; /**< Default params */ + /*@}*/ + + /** \name Other assorted state (not pushed/popped on attribute stack) */ + /*@{*/ + struct gl_pixelmaps PixelMaps; + + struct gl_evaluators EvalMap; /**< All evaluators */ + struct gl_feedback Feedback; /**< Feedback */ + struct gl_selection Select; /**< Selection */ + + struct gl_program_state Program; /**< general program state */ + struct gl_vertex_program_state VertexProgram; + struct gl_fragment_program_state FragmentProgram; + struct gl_geometry_program_state GeometryProgram; + struct gl_ati_fragment_shader_state ATIFragmentShader; + + struct gl_shader_state Shader; /**< GLSL shader object state */ + struct gl_shader_compiler_options ShaderCompilerOptions[MESA_SHADER_TYPES]; + + struct gl_query_state Query; /**< occlusion, timer queries */ + + struct gl_transform_feedback TransformFeedback; + + struct gl_buffer_object *CopyReadBuffer; /**< GL_ARB_copy_buffer */ + struct gl_buffer_object *CopyWriteBuffer; /**< GL_ARB_copy_buffer */ + /*@}*/ + + struct gl_meta_state *Meta; /**< for "meta" operations */ + + /* GL_EXT_framebuffer_object */ + struct gl_renderbuffer *CurrentRenderbuffer; + + GLenum ErrorValue; /**< Last error code */ + + /** + * Recognize and silence repeated error debug messages in buggy apps. + */ + const char *ErrorDebugFmtString; + GLuint ErrorDebugCount; + + GLenum RenderMode; /**< either GL_RENDER, GL_SELECT, GL_FEEDBACK */ + GLbitfield NewState; /**< bitwise-or of _NEW_* flags */ + + GLboolean ViewportInitialized; /**< has viewport size been initialized? */ + + GLbitfield varying_vp_inputs; /**< mask of VERT_BIT_* flags */ + + /** \name Derived state */ + /*@{*/ + /** Bitwise-or of DD_* flags. Note that this bitfield may be used before + * state validation so they need to always be current. 
+ */ + GLbitfield _TriangleCaps; + GLbitfield _ImageTransferState;/**< bitwise-or of IMAGE_*_BIT flags */ + GLfloat _EyeZDir[3]; + GLfloat _ModelViewInvScale; + GLboolean _NeedEyeCoords; + GLboolean _ForceEyeCoords; + + GLuint TextureStateTimestamp; /**< detect changes to shared state */ + + struct gl_shine_tab *_ShineTable[2]; /**< Active shine tables */ + struct gl_shine_tab *_ShineTabList; /**< MRU list of inactive shine tables */ + /**@}*/ + + struct gl_list_extensions *ListExt; /**< driver dlist extensions */ + + /** \name For debugging/development only */ + /*@{*/ + GLboolean FirstTimeCurrent; + /*@}*/ + + /** Dither disable via MESA_NO_DITHER env var */ + GLboolean NoDither; + + /** software compression/decompression supported or not */ + GLboolean Mesa_DXTn; + + GLboolean TextureFormatSupported[MESA_FORMAT_COUNT]; + + /** + * Use dp4 (rather than mul/mad) instructions for position + * transformation? + */ + GLboolean mvp_with_dp4; + + /** + * \name Hooks for module contexts. + * + * These will eventually live in the driver or elsewhere. + */ + /*@{*/ + void *swrast_context; + void *swsetup_context; + void *swtnl_context; + void *swtnl_im; + struct st_context *st; + void *aelt_context; + /*@}*/ +}; + + +#ifdef DEBUG +extern int MESA_VERBOSE; +extern int MESA_DEBUG_FLAGS; +# define MESA_FUNCTION __FUNCTION__ +#else +# define MESA_VERBOSE 0 +# define MESA_DEBUG_FLAGS 0 +# define MESA_FUNCTION "a function" +# ifndef NDEBUG +# define NDEBUG +# endif +#endif + + +/** The MESA_VERBOSE var is a bitmask of these flags */ +enum _verbose +{ + VERBOSE_VARRAY = 0x0001, + VERBOSE_TEXTURE = 0x0002, + VERBOSE_MATERIAL = 0x0004, + VERBOSE_PIPELINE = 0x0008, + VERBOSE_DRIVER = 0x0010, + VERBOSE_STATE = 0x0020, + VERBOSE_API = 0x0040, + VERBOSE_DISPLAY_LIST = 0x0100, + VERBOSE_LIGHTING = 0x0200, + VERBOSE_PRIMS = 0x0400, + VERBOSE_VERTS = 0x0800, + VERBOSE_DISASSEM = 0x1000, + VERBOSE_DRAW = 0x2000, + VERBOSE_SWAPBUFFERS = 0x4000 +}; + + +/** The MESA_DEBUG_FLAGS var is a bitmask of these flags */ +enum _debug +{ + DEBUG_ALWAYS_FLUSH = 0x1 +}; + + + +#endif /* MTYPES_H */ diff --git a/mesalib/src/mesa/main/state.c b/mesalib/src/mesa/main/state.c index c07e2c380..502c42929 100644 --- a/mesalib/src/mesa/main/state.c +++ b/mesalib/src/mesa/main/state.c @@ -1,732 +1,734 @@ -/* - * Mesa 3-D graphics library - * Version: 7.3 - * - * Copyright (C) 1999-2008 Brian Paul All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN - * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - - -/** - * \file state.c - * State management. 
- * - * This file manages recalculation of derived values in struct gl_context. - */ - - -#include "glheader.h" -#include "mtypes.h" -#include "context.h" -#include "debug.h" -#include "macros.h" -#include "ffvertex_prog.h" -#include "framebuffer.h" -#include "light.h" -#include "matrix.h" -#include "pixel.h" -#include "program/program.h" -#include "program/prog_parameter.h" -#include "state.h" -#include "stencil.h" -#include "texenvprogram.h" -#include "texobj.h" -#include "texstate.h" - - -static void -update_separate_specular(struct gl_context *ctx) -{ - if (NEED_SECONDARY_COLOR(ctx)) - ctx->_TriangleCaps |= DD_SEPARATE_SPECULAR; - else - ctx->_TriangleCaps &= ~DD_SEPARATE_SPECULAR; -} - - -/** - * Compute the index of the last array element that can be safely accessed - * in a vertex array. We can really only do this when the array lives in - * a VBO. - * The array->_MaxElement field will be updated. - * Later in glDrawArrays/Elements/etc we can do some bounds checking. - */ -static void -compute_max_element(struct gl_client_array *array) -{ - assert(array->Enabled); - if (array->BufferObj->Name) { - GLsizeiptrARB offset = (GLsizeiptrARB) array->Ptr; - GLsizeiptrARB obj_size = (GLsizeiptrARB) array->BufferObj->Size; - - if (offset < obj_size) { - array->_MaxElement = (obj_size - offset + - array->StrideB - - array->_ElementSize) / array->StrideB; - } else { - array->_MaxElement = 0; - } - } - else { - /* user-space array, no idea how big it is */ - array->_MaxElement = 2 * 1000 * 1000 * 1000; /* just a big number */ - } -} - - -/** - * Helper for update_arrays(). - * \return min(current min, array->_MaxElement). - */ -static GLuint -update_min(GLuint min, struct gl_client_array *array) -{ - compute_max_element(array); - return MIN2(min, array->_MaxElement); -} - - -/** - * Update ctx->Array._MaxElement (the max legal index into all enabled arrays). - * Need to do this upon new array state or new buffer object state. 
- */ -static void -update_arrays( struct gl_context *ctx ) -{ - struct gl_array_object *arrayObj = ctx->Array.ArrayObj; - GLuint i, min = ~0; - - /* find min of _MaxElement values for all enabled arrays */ - - /* 0 */ - if (ctx->VertexProgram._Current - && arrayObj->VertexAttrib[VERT_ATTRIB_POS].Enabled) { - min = update_min(min, &arrayObj->VertexAttrib[VERT_ATTRIB_POS]); - } - else if (arrayObj->Vertex.Enabled) { - min = update_min(min, &arrayObj->Vertex); - } - - /* 1 */ - if (ctx->VertexProgram._Enabled - && arrayObj->VertexAttrib[VERT_ATTRIB_WEIGHT].Enabled) { - min = update_min(min, &arrayObj->VertexAttrib[VERT_ATTRIB_WEIGHT]); - } - /* no conventional vertex weight array */ - - /* 2 */ - if (ctx->VertexProgram._Enabled - && arrayObj->VertexAttrib[VERT_ATTRIB_NORMAL].Enabled) { - min = update_min(min, &arrayObj->VertexAttrib[VERT_ATTRIB_NORMAL]); - } - else if (arrayObj->Normal.Enabled) { - min = update_min(min, &arrayObj->Normal); - } - - /* 3 */ - if (ctx->VertexProgram._Enabled - && arrayObj->VertexAttrib[VERT_ATTRIB_COLOR0].Enabled) { - min = update_min(min, &arrayObj->VertexAttrib[VERT_ATTRIB_COLOR0]); - } - else if (arrayObj->Color.Enabled) { - min = update_min(min, &arrayObj->Color); - } - - /* 4 */ - if (ctx->VertexProgram._Enabled - && arrayObj->VertexAttrib[VERT_ATTRIB_COLOR1].Enabled) { - min = update_min(min, &arrayObj->VertexAttrib[VERT_ATTRIB_COLOR1]); - } - else if (arrayObj->SecondaryColor.Enabled) { - min = update_min(min, &arrayObj->SecondaryColor); - } - - /* 5 */ - if (ctx->VertexProgram._Enabled - && arrayObj->VertexAttrib[VERT_ATTRIB_FOG].Enabled) { - min = update_min(min, &arrayObj->VertexAttrib[VERT_ATTRIB_FOG]); - } - else if (arrayObj->FogCoord.Enabled) { - min = update_min(min, &arrayObj->FogCoord); - } - - /* 6 */ - if (ctx->VertexProgram._Enabled - && arrayObj->VertexAttrib[VERT_ATTRIB_COLOR_INDEX].Enabled) { - min = update_min(min, &arrayObj->VertexAttrib[VERT_ATTRIB_COLOR_INDEX]); - } - else if (arrayObj->Index.Enabled) { - min = update_min(min, &arrayObj->Index); - } - - /* 7 */ - if (ctx->VertexProgram._Enabled - && arrayObj->VertexAttrib[VERT_ATTRIB_EDGEFLAG].Enabled) { - min = update_min(min, &arrayObj->VertexAttrib[VERT_ATTRIB_EDGEFLAG]); - } - - /* 8..15 */ - for (i = VERT_ATTRIB_TEX0; i <= VERT_ATTRIB_TEX7; i++) { - if (ctx->VertexProgram._Enabled - && arrayObj->VertexAttrib[i].Enabled) { - min = update_min(min, &arrayObj->VertexAttrib[i]); - } - else if (i - VERT_ATTRIB_TEX0 < ctx->Const.MaxTextureCoordUnits - && arrayObj->TexCoord[i - VERT_ATTRIB_TEX0].Enabled) { - min = update_min(min, &arrayObj->TexCoord[i - VERT_ATTRIB_TEX0]); - } - } - - /* 16..31 */ - if (ctx->VertexProgram._Current) { - for (i = 0; i < Elements(arrayObj->VertexAttrib); i++) { - if (arrayObj->VertexAttrib[i].Enabled) { - min = update_min(min, &arrayObj->VertexAttrib[i]); - } - } - } - - if (arrayObj->EdgeFlag.Enabled) { - min = update_min(min, &arrayObj->EdgeFlag); - } - - /* _MaxElement is one past the last legal array element */ - arrayObj->_MaxElement = min; -} - - -/** - * Update the following fields: - * ctx->VertexProgram._Enabled - * ctx->FragmentProgram._Enabled - * ctx->ATIFragmentShader._Enabled - * This needs to be done before texture state validation. - */ -static void -update_program_enables(struct gl_context *ctx) -{ - /* These _Enabled flags indicate if the program is enabled AND valid. 
*/ - ctx->VertexProgram._Enabled = ctx->VertexProgram.Enabled - && ctx->VertexProgram.Current->Base.Instructions; - ctx->FragmentProgram._Enabled = ctx->FragmentProgram.Enabled - && ctx->FragmentProgram.Current->Base.Instructions; - ctx->ATIFragmentShader._Enabled = ctx->ATIFragmentShader.Enabled - && ctx->ATIFragmentShader.Current->Instructions[0]; -} - - -/** - * Update vertex/fragment program state. In particular, update these fields: - * ctx->VertexProgram._Current - * ctx->VertexProgram._TnlProgram, - * These point to the highest priority enabled vertex/fragment program or are - * NULL if fixed-function processing is to be done. - * - * This function needs to be called after texture state validation in case - * we're generating a fragment program from fixed-function texture state. - * - * \return bitfield which will indicate _NEW_PROGRAM state if a new vertex - * or fragment program is being used. - */ -static GLbitfield -update_program(struct gl_context *ctx) -{ - const struct gl_shader_program *vsProg = ctx->Shader.CurrentVertexProgram; - const struct gl_shader_program *gsProg = ctx->Shader.CurrentGeometryProgram; - const struct gl_shader_program *fsProg = ctx->Shader.CurrentFragmentProgram; - const struct gl_vertex_program *prevVP = ctx->VertexProgram._Current; - const struct gl_fragment_program *prevFP = ctx->FragmentProgram._Current; - const struct gl_geometry_program *prevGP = ctx->GeometryProgram._Current; - GLbitfield new_state = 0x0; - - /* - * Set the ctx->VertexProgram._Current and ctx->FragmentProgram._Current - * pointers to the programs that should be used for rendering. If either - * is NULL, use fixed-function code paths. - * - * These programs may come from several sources. The priority is as - * follows: - * 1. OpenGL 2.0/ARB vertex/fragment shaders - * 2. ARB/NV vertex/fragment programs - * 3. Programs derived from fixed-function state. - * - * Note: it's possible for a vertex shader to get used with a fragment - * program (and vice versa) here, but in practice that shouldn't ever - * come up, or matter. - */ - - if (fsProg && fsProg->LinkStatus && fsProg->FragmentProgram) { - /* Use shader programs */ - _mesa_reference_fragprog(ctx, &ctx->FragmentProgram._Current, - fsProg->FragmentProgram); - } - else if (ctx->FragmentProgram._Enabled) { - /* use user-defined vertex program */ - _mesa_reference_fragprog(ctx, &ctx->FragmentProgram._Current, - ctx->FragmentProgram.Current); - } - else if (ctx->FragmentProgram._MaintainTexEnvProgram) { - /* Use fragment program generated from fixed-function state. - */ - _mesa_reference_fragprog(ctx, &ctx->FragmentProgram._Current, - _mesa_get_fixed_func_fragment_program(ctx)); - _mesa_reference_fragprog(ctx, &ctx->FragmentProgram._TexEnvProgram, - ctx->FragmentProgram._Current); - } - else { - /* no fragment program */ - _mesa_reference_fragprog(ctx, &ctx->FragmentProgram._Current, NULL); - } - - if (gsProg && gsProg->LinkStatus && gsProg->GeometryProgram) { - /* Use shader programs */ - _mesa_reference_geomprog(ctx, &ctx->GeometryProgram._Current, - gsProg->GeometryProgram); - } else { - /* no fragment program */ - _mesa_reference_geomprog(ctx, &ctx->GeometryProgram._Current, NULL); - } - - /* Examine vertex program after fragment program as - * _mesa_get_fixed_func_vertex_program() needs to know active - * fragprog inputs. 
- */ - if (vsProg && vsProg->LinkStatus && vsProg->VertexProgram) { - /* Use shader programs */ - _mesa_reference_vertprog(ctx, &ctx->VertexProgram._Current, - vsProg->VertexProgram); - } - else if (ctx->VertexProgram._Enabled) { - /* use user-defined vertex program */ - _mesa_reference_vertprog(ctx, &ctx->VertexProgram._Current, - ctx->VertexProgram.Current); - } - else if (ctx->VertexProgram._MaintainTnlProgram) { - /* Use vertex program generated from fixed-function state. - */ - _mesa_reference_vertprog(ctx, &ctx->VertexProgram._Current, - _mesa_get_fixed_func_vertex_program(ctx)); - _mesa_reference_vertprog(ctx, &ctx->VertexProgram._TnlProgram, - ctx->VertexProgram._Current); - } - else { - /* no vertex program */ - _mesa_reference_vertprog(ctx, &ctx->VertexProgram._Current, NULL); - } - - /* Let the driver know what's happening: - */ - if (ctx->FragmentProgram._Current != prevFP) { - new_state |= _NEW_PROGRAM; - if (ctx->Driver.BindProgram) { - ctx->Driver.BindProgram(ctx, GL_FRAGMENT_PROGRAM_ARB, - (struct gl_program *) ctx->FragmentProgram._Current); - } - } - - if (ctx->GeometryProgram._Current != prevGP) { - new_state |= _NEW_PROGRAM; - if (ctx->Driver.BindProgram) { - ctx->Driver.BindProgram(ctx, MESA_GEOMETRY_PROGRAM, - (struct gl_program *) ctx->GeometryProgram._Current); - } - } - - if (ctx->VertexProgram._Current != prevVP) { - new_state |= _NEW_PROGRAM; - if (ctx->Driver.BindProgram) { - ctx->Driver.BindProgram(ctx, GL_VERTEX_PROGRAM_ARB, - (struct gl_program *) ctx->VertexProgram._Current); - } - } - - return new_state; -} - - -/** - * Examine shader constants and return either _NEW_PROGRAM_CONSTANTS or 0. - */ -static GLbitfield -update_program_constants(struct gl_context *ctx) -{ - GLbitfield new_state = 0x0; - - if (ctx->FragmentProgram._Current) { - const struct gl_program_parameter_list *params = - ctx->FragmentProgram._Current->Base.Parameters; - if (params && params->StateFlags & ctx->NewState) { - new_state |= _NEW_PROGRAM_CONSTANTS; - } - } - - if (ctx->GeometryProgram._Current) { - const struct gl_program_parameter_list *params = - ctx->GeometryProgram._Current->Base.Parameters; - /*FIXME: StateFlags is always 0 because we have unnamed constant - * not state changes */ - if (params /*&& params->StateFlags & ctx->NewState*/) { - new_state |= _NEW_PROGRAM_CONSTANTS; - } - } - - if (ctx->VertexProgram._Current) { - const struct gl_program_parameter_list *params = - ctx->VertexProgram._Current->Base.Parameters; - if (params && params->StateFlags & ctx->NewState) { - new_state |= _NEW_PROGRAM_CONSTANTS; - } - } - - return new_state; -} - - - - -static void -update_viewport_matrix(struct gl_context *ctx) -{ - const GLfloat depthMax = ctx->DrawBuffer->_DepthMaxF; - - ASSERT(depthMax > 0); - - /* Compute scale and bias values. This is really driver-specific - * and should be maintained elsewhere if at all. - * NOTE: RasterPos uses this. - */ - _math_matrix_viewport(&ctx->Viewport._WindowMap, - ctx->Viewport.X, ctx->Viewport.Y, - ctx->Viewport.Width, ctx->Viewport.Height, - ctx->Viewport.Near, ctx->Viewport.Far, - depthMax); -} - - -/** - * Update derived multisample state. - */ -static void -update_multisample(struct gl_context *ctx) -{ - ctx->Multisample._Enabled = GL_FALSE; - if (ctx->Multisample.Enabled && - ctx->DrawBuffer && - ctx->DrawBuffer->Visual.sampleBuffers) - ctx->Multisample._Enabled = GL_TRUE; -} - - -/** - * Update derived color/blend/logicop state. 
- */ -static void -update_color(struct gl_context *ctx) -{ - /* This is needed to support 1.1's RGB logic ops AND - * 1.0's blending logicops. - */ - ctx->Color._LogicOpEnabled = RGBA_LOGICOP_ENABLED(ctx); -} - - -/* - * Check polygon state and set DD_TRI_CULL_FRONT_BACK and/or DD_TRI_OFFSET - * in ctx->_TriangleCaps if needed. - */ -static void -update_polygon(struct gl_context *ctx) -{ - ctx->_TriangleCaps &= ~(DD_TRI_CULL_FRONT_BACK | DD_TRI_OFFSET); - - if (ctx->Polygon.CullFlag && ctx->Polygon.CullFaceMode == GL_FRONT_AND_BACK) - ctx->_TriangleCaps |= DD_TRI_CULL_FRONT_BACK; - - if ( ctx->Polygon.OffsetPoint - || ctx->Polygon.OffsetLine - || ctx->Polygon.OffsetFill) - ctx->_TriangleCaps |= DD_TRI_OFFSET; -} - - -/** - * Update the ctx->_TriangleCaps bitfield. - * XXX that bitfield should really go away someday! - * This function must be called after other update_*() functions since - * there are dependencies on some other derived values. - */ -#if 0 -static void -update_tricaps(struct gl_context *ctx, GLbitfield new_state) -{ - ctx->_TriangleCaps = 0; - - /* - * Points - */ - if (1/*new_state & _NEW_POINT*/) { - if (ctx->Point.SmoothFlag) - ctx->_TriangleCaps |= DD_POINT_SMOOTH; - if (ctx->Point._Attenuated) - ctx->_TriangleCaps |= DD_POINT_ATTEN; - } - - /* - * Lines - */ - if (1/*new_state & _NEW_LINE*/) { - if (ctx->Line.SmoothFlag) - ctx->_TriangleCaps |= DD_LINE_SMOOTH; - if (ctx->Line.StippleFlag) - ctx->_TriangleCaps |= DD_LINE_STIPPLE; - } - - /* - * Polygons - */ - if (1/*new_state & _NEW_POLYGON*/) { - if (ctx->Polygon.SmoothFlag) - ctx->_TriangleCaps |= DD_TRI_SMOOTH; - if (ctx->Polygon.StippleFlag) - ctx->_TriangleCaps |= DD_TRI_STIPPLE; - if (ctx->Polygon.FrontMode != GL_FILL - || ctx->Polygon.BackMode != GL_FILL) - ctx->_TriangleCaps |= DD_TRI_UNFILLED; - if (ctx->Polygon.CullFlag - && ctx->Polygon.CullFaceMode == GL_FRONT_AND_BACK) - ctx->_TriangleCaps |= DD_TRI_CULL_FRONT_BACK; - if (ctx->Polygon.OffsetPoint || - ctx->Polygon.OffsetLine || - ctx->Polygon.OffsetFill) - ctx->_TriangleCaps |= DD_TRI_OFFSET; - } - - /* - * Lighting and shading - */ - if (ctx->Light.Enabled && ctx->Light.Model.TwoSide) - ctx->_TriangleCaps |= DD_TRI_LIGHT_TWOSIDE; - if (ctx->Light.ShadeModel == GL_FLAT) - ctx->_TriangleCaps |= DD_FLATSHADE; - if (NEED_SECONDARY_COLOR(ctx)) - ctx->_TriangleCaps |= DD_SEPARATE_SPECULAR; - - /* - * Stencil - */ - if (ctx->Stencil._TestTwoSide) - ctx->_TriangleCaps |= DD_TRI_TWOSTENCIL; -} -#endif - - -/** - * Compute derived GL state. - * If __struct gl_contextRec::NewState is non-zero then this function \b must - * be called before rendering anything. - * - * Calls dd_function_table::UpdateState to perform any internal state - * management necessary. - * - * \sa _mesa_update_modelview_project(), _mesa_update_texture(), - * _mesa_update_buffer_bounds(), - * _mesa_update_lighting() and _mesa_update_tnl_spaces(). 
- */ -void -_mesa_update_state_locked( struct gl_context *ctx ) -{ - GLbitfield new_state = ctx->NewState; - GLbitfield prog_flags = _NEW_PROGRAM; - GLbitfield new_prog_state = 0x0; - - if (new_state == _NEW_CURRENT_ATTRIB) - goto out; - - if (MESA_VERBOSE & VERBOSE_STATE) - _mesa_print_state("_mesa_update_state", new_state); - - /* Determine which state flags effect vertex/fragment program state */ - if (ctx->FragmentProgram._MaintainTexEnvProgram) { - prog_flags |= (_NEW_BUFFERS | _NEW_TEXTURE | _NEW_FOG | - _NEW_ARRAY | _NEW_LIGHT | _NEW_POINT | _NEW_RENDERMODE | - _NEW_PROGRAM); - } - if (ctx->VertexProgram._MaintainTnlProgram) { - prog_flags |= (_NEW_ARRAY | _NEW_TEXTURE | _NEW_TEXTURE_MATRIX | - _NEW_TRANSFORM | _NEW_POINT | - _NEW_FOG | _NEW_LIGHT | - _MESA_NEW_NEED_EYE_COORDS); - } - - /* - * Now update derived state info - */ - - if (new_state & prog_flags) - update_program_enables( ctx ); - - if (new_state & (_NEW_MODELVIEW|_NEW_PROJECTION)) - _mesa_update_modelview_project( ctx, new_state ); - - if (new_state & (_NEW_PROGRAM|_NEW_TEXTURE|_NEW_TEXTURE_MATRIX)) - _mesa_update_texture( ctx, new_state ); - - if (new_state & _NEW_BUFFERS) - _mesa_update_framebuffer(ctx); - - if (new_state & (_NEW_SCISSOR | _NEW_BUFFERS | _NEW_VIEWPORT)) - _mesa_update_draw_buffer_bounds( ctx ); - - if (new_state & _NEW_POLYGON) - update_polygon( ctx ); - - if (new_state & _NEW_LIGHT) - _mesa_update_lighting( ctx ); - - if (new_state & (_NEW_STENCIL | _NEW_BUFFERS)) - _mesa_update_stencil( ctx ); - - if (new_state & _MESA_NEW_TRANSFER_STATE) - _mesa_update_pixel( ctx, new_state ); - - if (new_state & _DD_NEW_SEPARATE_SPECULAR) - update_separate_specular( ctx ); - - if (new_state & (_NEW_BUFFERS | _NEW_VIEWPORT)) - update_viewport_matrix(ctx); - - if (new_state & _NEW_MULTISAMPLE) - update_multisample( ctx ); - - if (new_state & _NEW_COLOR) - update_color( ctx ); - -#if 0 - if (new_state & (_NEW_POINT | _NEW_LINE | _NEW_POLYGON | _NEW_LIGHT - | _NEW_STENCIL | _DD_NEW_SEPARATE_SPECULAR)) - update_tricaps( ctx, new_state ); -#endif - - /* ctx->_NeedEyeCoords is now up to date. - * - * If the truth value of this variable has changed, update for the - * new lighting space and recompute the positions of lights and the - * normal transform. - * - * If the lighting space hasn't changed, may still need to recompute - * light positions & normal transforms for other reasons. - */ - if (new_state & _MESA_NEW_NEED_EYE_COORDS) - _mesa_update_tnl_spaces( ctx, new_state ); - - if (new_state & prog_flags) { - /* When we generate programs from fixed-function vertex/fragment state - * this call may generate/bind a new program. If so, we need to - * propogate the _NEW_PROGRAM flag to the driver. - */ - new_prog_state |= update_program( ctx ); - } - - if (new_state & (_NEW_ARRAY | _NEW_PROGRAM | _NEW_BUFFER_OBJECT)) - update_arrays( ctx ); - - out: - new_prog_state |= update_program_constants(ctx); - - /* - * Give the driver a chance to act upon the new_state flags. - * The driver might plug in different span functions, for example. - * Also, this is where the driver can invalidate the state of any - * active modules (such as swrast_setup, swrast, tnl, etc). - * - * Set ctx->NewState to zero to avoid recursion if - * Driver.UpdateState() has to call FLUSH_VERTICES(). (fixed?) 
- */ - new_state = ctx->NewState | new_prog_state; - ctx->NewState = 0; - ctx->Driver.UpdateState(ctx, new_state); - ctx->Array.NewState = 0; -} - - -/* This is the usual entrypoint for state updates: - */ -void -_mesa_update_state( struct gl_context *ctx ) -{ - _mesa_lock_context_textures(ctx); - _mesa_update_state_locked(ctx); - _mesa_unlock_context_textures(ctx); -} - - - - -/** - * Want to figure out which fragment program inputs are actually - * constant/current values from ctx->Current. These should be - * referenced as a tracked state variable rather than a fragment - * program input, to save the overhead of putting a constant value in - * every submitted vertex, transferring it to hardware, interpolating - * it across the triangle, etc... - * - * When there is a VP bound, just use vp->outputs. But when we're - * generating vp from fixed function state, basically want to - * calculate: - * - * vp_out_2_fp_in( vp_in_2_vp_out( varying_inputs ) | - * potential_vp_outputs ) - * - * Where potential_vp_outputs is calculated by looking at enabled - * texgen, etc. - * - * The generated fragment program should then only declare inputs that - * may vary or otherwise differ from the ctx->Current values. - * Otherwise, the fp should track them as state values instead. - */ -void -_mesa_set_varying_vp_inputs( struct gl_context *ctx, - GLbitfield varying_inputs ) -{ - if (ctx->varying_vp_inputs != varying_inputs) { - ctx->varying_vp_inputs = varying_inputs; - ctx->NewState |= _NEW_ARRAY; - /*printf("%s %x\n", __FUNCTION__, varying_inputs);*/ - } -} - - -/** - * Used by drivers to tell core Mesa that the driver is going to - * install/ use its own vertex program. In particular, this will - * prevent generated fragment programs from using state vars instead - * of ordinary varyings/inputs. - */ -void -_mesa_set_vp_override(struct gl_context *ctx, GLboolean flag) -{ - if (ctx->VertexProgram._Overriden != flag) { - ctx->VertexProgram._Overriden = flag; - - /* Set one of the bits which will trigger fragment program - * regeneration: - */ - ctx->NewState |= _NEW_PROGRAM; - } -} +/* + * Mesa 3-D graphics library + * Version: 7.3 + * + * Copyright (C) 1999-2008 Brian Paul All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN + * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + + +/** + * \file state.c + * State management. + * + * This file manages recalculation of derived values in struct gl_context. 
+ */ + + +#include "glheader.h" +#include "mtypes.h" +#include "context.h" +#include "debug.h" +#include "macros.h" +#include "ffvertex_prog.h" +#include "framebuffer.h" +#include "light.h" +#include "matrix.h" +#include "pixel.h" +#include "program/program.h" +#include "program/prog_parameter.h" +#include "state.h" +#include "stencil.h" +#include "texenvprogram.h" +#include "texobj.h" +#include "texstate.h" + + +static void +update_separate_specular(struct gl_context *ctx) +{ + if (NEED_SECONDARY_COLOR(ctx)) + ctx->_TriangleCaps |= DD_SEPARATE_SPECULAR; + else + ctx->_TriangleCaps &= ~DD_SEPARATE_SPECULAR; +} + + +/** + * Compute the index of the last array element that can be safely accessed + * in a vertex array. We can really only do this when the array lives in + * a VBO. + * The array->_MaxElement field will be updated. + * Later in glDrawArrays/Elements/etc we can do some bounds checking. + */ +static void +compute_max_element(struct gl_client_array *array) +{ + assert(array->Enabled); + if (array->BufferObj->Name) { + GLsizeiptrARB offset = (GLsizeiptrARB) array->Ptr; + GLsizeiptrARB obj_size = (GLsizeiptrARB) array->BufferObj->Size; + + if (offset < obj_size) { + array->_MaxElement = (obj_size - offset + + array->StrideB - + array->_ElementSize) / array->StrideB; + } else { + array->_MaxElement = 0; + } + } + else { + /* user-space array, no idea how big it is */ + array->_MaxElement = 2 * 1000 * 1000 * 1000; /* just a big number */ + } +} + + +/** + * Helper for update_arrays(). + * \return min(current min, array->_MaxElement). + */ +static GLuint +update_min(GLuint min, struct gl_client_array *array) +{ + compute_max_element(array); + return MIN2(min, array->_MaxElement); +} + + +/** + * Update ctx->Array._MaxElement (the max legal index into all enabled arrays). + * Need to do this upon new array state or new buffer object state. 
+ */ +static void +update_arrays( struct gl_context *ctx ) +{ + struct gl_array_object *arrayObj = ctx->Array.ArrayObj; + GLuint i, min = ~0; + + /* find min of _MaxElement values for all enabled arrays */ + + /* 0 */ + if (ctx->VertexProgram._Current + && arrayObj->VertexAttrib[VERT_ATTRIB_POS].Enabled) { + min = update_min(min, &arrayObj->VertexAttrib[VERT_ATTRIB_POS]); + } + else if (arrayObj->Vertex.Enabled) { + min = update_min(min, &arrayObj->Vertex); + } + + /* 1 */ + if (ctx->VertexProgram._Enabled + && arrayObj->VertexAttrib[VERT_ATTRIB_WEIGHT].Enabled) { + min = update_min(min, &arrayObj->VertexAttrib[VERT_ATTRIB_WEIGHT]); + } + /* no conventional vertex weight array */ + + /* 2 */ + if (ctx->VertexProgram._Enabled + && arrayObj->VertexAttrib[VERT_ATTRIB_NORMAL].Enabled) { + min = update_min(min, &arrayObj->VertexAttrib[VERT_ATTRIB_NORMAL]); + } + else if (arrayObj->Normal.Enabled) { + min = update_min(min, &arrayObj->Normal); + } + + /* 3 */ + if (ctx->VertexProgram._Enabled + && arrayObj->VertexAttrib[VERT_ATTRIB_COLOR0].Enabled) { + min = update_min(min, &arrayObj->VertexAttrib[VERT_ATTRIB_COLOR0]); + } + else if (arrayObj->Color.Enabled) { + min = update_min(min, &arrayObj->Color); + } + + /* 4 */ + if (ctx->VertexProgram._Enabled + && arrayObj->VertexAttrib[VERT_ATTRIB_COLOR1].Enabled) { + min = update_min(min, &arrayObj->VertexAttrib[VERT_ATTRIB_COLOR1]); + } + else if (arrayObj->SecondaryColor.Enabled) { + min = update_min(min, &arrayObj->SecondaryColor); + } + + /* 5 */ + if (ctx->VertexProgram._Enabled + && arrayObj->VertexAttrib[VERT_ATTRIB_FOG].Enabled) { + min = update_min(min, &arrayObj->VertexAttrib[VERT_ATTRIB_FOG]); + } + else if (arrayObj->FogCoord.Enabled) { + min = update_min(min, &arrayObj->FogCoord); + } + + /* 6 */ + if (ctx->VertexProgram._Enabled + && arrayObj->VertexAttrib[VERT_ATTRIB_COLOR_INDEX].Enabled) { + min = update_min(min, &arrayObj->VertexAttrib[VERT_ATTRIB_COLOR_INDEX]); + } + else if (arrayObj->Index.Enabled) { + min = update_min(min, &arrayObj->Index); + } + + /* 7 */ + if (ctx->VertexProgram._Enabled + && arrayObj->VertexAttrib[VERT_ATTRIB_EDGEFLAG].Enabled) { + min = update_min(min, &arrayObj->VertexAttrib[VERT_ATTRIB_EDGEFLAG]); + } + + /* 8..15 */ + for (i = VERT_ATTRIB_TEX0; i <= VERT_ATTRIB_TEX7; i++) { + if (ctx->VertexProgram._Enabled + && arrayObj->VertexAttrib[i].Enabled) { + min = update_min(min, &arrayObj->VertexAttrib[i]); + } + else if (i - VERT_ATTRIB_TEX0 < ctx->Const.MaxTextureCoordUnits + && arrayObj->TexCoord[i - VERT_ATTRIB_TEX0].Enabled) { + min = update_min(min, &arrayObj->TexCoord[i - VERT_ATTRIB_TEX0]); + } + } + + /* 16..31 */ + if (ctx->VertexProgram._Current) { + for (i = 0; i < Elements(arrayObj->VertexAttrib); i++) { + if (arrayObj->VertexAttrib[i].Enabled) { + min = update_min(min, &arrayObj->VertexAttrib[i]); + } + } + } + + if (arrayObj->EdgeFlag.Enabled) { + min = update_min(min, &arrayObj->EdgeFlag); + } + + /* _MaxElement is one past the last legal array element */ + arrayObj->_MaxElement = min; +} + + +/** + * Update the following fields: + * ctx->VertexProgram._Enabled + * ctx->FragmentProgram._Enabled + * ctx->ATIFragmentShader._Enabled + * This needs to be done before texture state validation. + */ +static void +update_program_enables(struct gl_context *ctx) +{ + /* These _Enabled flags indicate if the program is enabled AND valid. 
*/ + ctx->VertexProgram._Enabled = ctx->VertexProgram.Enabled + && ctx->VertexProgram.Current->Base.Instructions; + ctx->FragmentProgram._Enabled = ctx->FragmentProgram.Enabled + && ctx->FragmentProgram.Current->Base.Instructions; + ctx->ATIFragmentShader._Enabled = ctx->ATIFragmentShader.Enabled + && ctx->ATIFragmentShader.Current->Instructions[0]; +} + + +/** + * Update vertex/fragment program state. In particular, update these fields: + * ctx->VertexProgram._Current + * ctx->VertexProgram._TnlProgram, + * These point to the highest priority enabled vertex/fragment program or are + * NULL if fixed-function processing is to be done. + * + * This function needs to be called after texture state validation in case + * we're generating a fragment program from fixed-function texture state. + * + * \return bitfield which will indicate _NEW_PROGRAM state if a new vertex + * or fragment program is being used. + */ +static GLbitfield +update_program(struct gl_context *ctx) +{ + const struct gl_shader_program *vsProg = ctx->Shader.CurrentVertexProgram; + const struct gl_shader_program *gsProg = ctx->Shader.CurrentGeometryProgram; + const struct gl_shader_program *fsProg = ctx->Shader.CurrentFragmentProgram; + const struct gl_vertex_program *prevVP = ctx->VertexProgram._Current; + const struct gl_fragment_program *prevFP = ctx->FragmentProgram._Current; + const struct gl_geometry_program *prevGP = ctx->GeometryProgram._Current; + GLbitfield new_state = 0x0; + + /* + * Set the ctx->VertexProgram._Current and ctx->FragmentProgram._Current + * pointers to the programs that should be used for rendering. If either + * is NULL, use fixed-function code paths. + * + * These programs may come from several sources. The priority is as + * follows: + * 1. OpenGL 2.0/ARB vertex/fragment shaders + * 2. ARB/NV vertex/fragment programs + * 3. Programs derived from fixed-function state. + * + * Note: it's possible for a vertex shader to get used with a fragment + * program (and vice versa) here, but in practice that shouldn't ever + * come up, or matter. + */ + + if (fsProg && fsProg->LinkStatus && fsProg->FragmentProgram) { + /* Use shader programs */ + _mesa_reference_fragprog(ctx, &ctx->FragmentProgram._Current, + fsProg->FragmentProgram); + } + else if (ctx->FragmentProgram._Enabled) { + /* use user-defined vertex program */ + _mesa_reference_fragprog(ctx, &ctx->FragmentProgram._Current, + ctx->FragmentProgram.Current); + } + else if (ctx->FragmentProgram._MaintainTexEnvProgram) { + /* Use fragment program generated from fixed-function state. + */ + _mesa_reference_fragprog(ctx, &ctx->FragmentProgram._Current, + _mesa_get_fixed_func_fragment_program(ctx)); + _mesa_reference_fragprog(ctx, &ctx->FragmentProgram._TexEnvProgram, + ctx->FragmentProgram._Current); + } + else { + /* no fragment program */ + _mesa_reference_fragprog(ctx, &ctx->FragmentProgram._Current, NULL); + } + + if (gsProg && gsProg->LinkStatus && gsProg->GeometryProgram) { + /* Use shader programs */ + _mesa_reference_geomprog(ctx, &ctx->GeometryProgram._Current, + gsProg->GeometryProgram); + } else { + /* no fragment program */ + _mesa_reference_geomprog(ctx, &ctx->GeometryProgram._Current, NULL); + } + + /* Examine vertex program after fragment program as + * _mesa_get_fixed_func_vertex_program() needs to know active + * fragprog inputs. 
+ */ + if (vsProg && vsProg->LinkStatus && vsProg->VertexProgram) { + /* Use shader programs */ + _mesa_reference_vertprog(ctx, &ctx->VertexProgram._Current, + vsProg->VertexProgram); + } + else if (ctx->VertexProgram._Enabled) { + /* use user-defined vertex program */ + _mesa_reference_vertprog(ctx, &ctx->VertexProgram._Current, + ctx->VertexProgram.Current); + } + else if (ctx->VertexProgram._MaintainTnlProgram) { + /* Use vertex program generated from fixed-function state. + */ + _mesa_reference_vertprog(ctx, &ctx->VertexProgram._Current, + _mesa_get_fixed_func_vertex_program(ctx)); + _mesa_reference_vertprog(ctx, &ctx->VertexProgram._TnlProgram, + ctx->VertexProgram._Current); + } + else { + /* no vertex program */ + _mesa_reference_vertprog(ctx, &ctx->VertexProgram._Current, NULL); + } + + /* Let the driver know what's happening: + */ + if (ctx->FragmentProgram._Current != prevFP) { + new_state |= _NEW_PROGRAM; + if (ctx->Driver.BindProgram) { + ctx->Driver.BindProgram(ctx, GL_FRAGMENT_PROGRAM_ARB, + (struct gl_program *) ctx->FragmentProgram._Current); + } + } + + if (ctx->GeometryProgram._Current != prevGP) { + new_state |= _NEW_PROGRAM; + if (ctx->Driver.BindProgram) { + ctx->Driver.BindProgram(ctx, MESA_GEOMETRY_PROGRAM, + (struct gl_program *) ctx->GeometryProgram._Current); + } + } + + if (ctx->VertexProgram._Current != prevVP) { + new_state |= _NEW_PROGRAM; + if (ctx->Driver.BindProgram) { + ctx->Driver.BindProgram(ctx, GL_VERTEX_PROGRAM_ARB, + (struct gl_program *) ctx->VertexProgram._Current); + } + } + + return new_state; +} + + +/** + * Examine shader constants and return either _NEW_PROGRAM_CONSTANTS or 0. + */ +static GLbitfield +update_program_constants(struct gl_context *ctx) +{ + GLbitfield new_state = 0x0; + + if (ctx->FragmentProgram._Current) { + const struct gl_program_parameter_list *params = + ctx->FragmentProgram._Current->Base.Parameters; + if (params && params->StateFlags & ctx->NewState) { + new_state |= _NEW_PROGRAM_CONSTANTS; + } + } + + if (ctx->GeometryProgram._Current) { + const struct gl_program_parameter_list *params = + ctx->GeometryProgram._Current->Base.Parameters; + /*FIXME: StateFlags is always 0 because we have unnamed constant + * not state changes */ + if (params /*&& params->StateFlags & ctx->NewState*/) { + new_state |= _NEW_PROGRAM_CONSTANTS; + } + } + + if (ctx->VertexProgram._Current) { + const struct gl_program_parameter_list *params = + ctx->VertexProgram._Current->Base.Parameters; + if (params && params->StateFlags & ctx->NewState) { + new_state |= _NEW_PROGRAM_CONSTANTS; + } + } + + return new_state; +} + + + + +static void +update_viewport_matrix(struct gl_context *ctx) +{ + const GLfloat depthMax = ctx->DrawBuffer->_DepthMaxF; + + ASSERT(depthMax > 0); + + /* Compute scale and bias values. This is really driver-specific + * and should be maintained elsewhere if at all. + * NOTE: RasterPos uses this. + */ + _math_matrix_viewport(&ctx->Viewport._WindowMap, + ctx->Viewport.X, ctx->Viewport.Y, + ctx->Viewport.Width, ctx->Viewport.Height, + ctx->Viewport.Near, ctx->Viewport.Far, + depthMax); +} + + +/** + * Update derived multisample state. + */ +static void +update_multisample(struct gl_context *ctx) +{ + ctx->Multisample._Enabled = GL_FALSE; + if (ctx->Multisample.Enabled && + ctx->DrawBuffer && + ctx->DrawBuffer->Visual.sampleBuffers) + ctx->Multisample._Enabled = GL_TRUE; +} + + +/** + * Update derived color/blend/logicop state. 
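update_viewport_matrix() above plugs the viewport and depth range into _math_matrix_viewport(); assuming the usual GL window transform, the scale/bias work out as in this made-up example (800x600 viewport at the origin, DepthRange 0..1, _DepthMaxF of 65535):

    /* Illustration of the scale/bias set up by update_viewport_matrix() above. */
    GLfloat vx = 0.0f, vy = 0.0f, vw = 800.0f, vh = 600.0f;
    GLfloat znear = 0.0f, zfar = 1.0f, depth_max = 65535.0f;
    GLfloat sx = vw * 0.5f,                         tx = vx + vw * 0.5f;
    GLfloat sy = vh * 0.5f,                         ty = vy + vh * 0.5f;
    GLfloat sz = depth_max * (zfar - znear) * 0.5f, tz = depth_max * (zfar + znear) * 0.5f;
    /* NDC point (0.5, -1.0, 0.0) -> window (600.0, 0.0, 32767.5) */
    GLfloat win_x =  0.5f * sx + tx;
    GLfloat win_y = -1.0f * sy + ty;
    GLfloat win_z =  0.0f * sz + tz;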
+ */ +static void +update_color(struct gl_context *ctx) +{ + /* This is needed to support 1.1's RGB logic ops AND + * 1.0's blending logicops. + */ + ctx->Color._LogicOpEnabled = RGBA_LOGICOP_ENABLED(ctx); +} + + +/* + * Check polygon state and set DD_TRI_CULL_FRONT_BACK and/or DD_TRI_OFFSET + * in ctx->_TriangleCaps if needed. + */ +static void +update_polygon(struct gl_context *ctx) +{ + ctx->_TriangleCaps &= ~(DD_TRI_CULL_FRONT_BACK | DD_TRI_OFFSET); + + if (ctx->Polygon.CullFlag && ctx->Polygon.CullFaceMode == GL_FRONT_AND_BACK) + ctx->_TriangleCaps |= DD_TRI_CULL_FRONT_BACK; + + if ( ctx->Polygon.OffsetPoint + || ctx->Polygon.OffsetLine + || ctx->Polygon.OffsetFill) + ctx->_TriangleCaps |= DD_TRI_OFFSET; +} + + +/** + * Update the ctx->_TriangleCaps bitfield. + * XXX that bitfield should really go away someday! + * This function must be called after other update_*() functions since + * there are dependencies on some other derived values. + */ +#if 0 +static void +update_tricaps(struct gl_context *ctx, GLbitfield new_state) +{ + ctx->_TriangleCaps = 0; + + /* + * Points + */ + if (1/*new_state & _NEW_POINT*/) { + if (ctx->Point.SmoothFlag) + ctx->_TriangleCaps |= DD_POINT_SMOOTH; + if (ctx->Point._Attenuated) + ctx->_TriangleCaps |= DD_POINT_ATTEN; + } + + /* + * Lines + */ + if (1/*new_state & _NEW_LINE*/) { + if (ctx->Line.SmoothFlag) + ctx->_TriangleCaps |= DD_LINE_SMOOTH; + if (ctx->Line.StippleFlag) + ctx->_TriangleCaps |= DD_LINE_STIPPLE; + } + + /* + * Polygons + */ + if (1/*new_state & _NEW_POLYGON*/) { + if (ctx->Polygon.SmoothFlag) + ctx->_TriangleCaps |= DD_TRI_SMOOTH; + if (ctx->Polygon.StippleFlag) + ctx->_TriangleCaps |= DD_TRI_STIPPLE; + if (ctx->Polygon.FrontMode != GL_FILL + || ctx->Polygon.BackMode != GL_FILL) + ctx->_TriangleCaps |= DD_TRI_UNFILLED; + if (ctx->Polygon.CullFlag + && ctx->Polygon.CullFaceMode == GL_FRONT_AND_BACK) + ctx->_TriangleCaps |= DD_TRI_CULL_FRONT_BACK; + if (ctx->Polygon.OffsetPoint || + ctx->Polygon.OffsetLine || + ctx->Polygon.OffsetFill) + ctx->_TriangleCaps |= DD_TRI_OFFSET; + } + + /* + * Lighting and shading + */ + if (ctx->Light.Enabled && ctx->Light.Model.TwoSide) + ctx->_TriangleCaps |= DD_TRI_LIGHT_TWOSIDE; + if (ctx->Light.ShadeModel == GL_FLAT) + ctx->_TriangleCaps |= DD_FLATSHADE; + if (NEED_SECONDARY_COLOR(ctx)) + ctx->_TriangleCaps |= DD_SEPARATE_SPECULAR; + + /* + * Stencil + */ + if (ctx->Stencil._TestTwoSide) + ctx->_TriangleCaps |= DD_TRI_TWOSTENCIL; +} +#endif + + +/** + * Compute derived GL state. + * If __struct gl_contextRec::NewState is non-zero then this function \b must + * be called before rendering anything. + * + * Calls dd_function_table::UpdateState to perform any internal state + * management necessary. + * + * \sa _mesa_update_modelview_project(), _mesa_update_texture(), + * _mesa_update_buffer_bounds(), + * _mesa_update_lighting() and _mesa_update_tnl_spaces(). 
+ */ +void +_mesa_update_state_locked( struct gl_context *ctx ) +{ + GLbitfield new_state = ctx->NewState; + GLbitfield prog_flags = _NEW_PROGRAM; + GLbitfield new_prog_state = 0x0; + + if (new_state == _NEW_CURRENT_ATTRIB) + goto out; + + if (MESA_VERBOSE & VERBOSE_STATE) + _mesa_print_state("_mesa_update_state", new_state); + + /* Determine which state flags effect vertex/fragment program state */ + if (ctx->FragmentProgram._MaintainTexEnvProgram) { + prog_flags |= (_NEW_BUFFERS | _NEW_TEXTURE | _NEW_FOG | + _NEW_ARRAY | _NEW_LIGHT | _NEW_POINT | _NEW_RENDERMODE | + _NEW_PROGRAM); + } + if (ctx->VertexProgram._MaintainTnlProgram) { + prog_flags |= (_NEW_ARRAY | _NEW_TEXTURE | _NEW_TEXTURE_MATRIX | + _NEW_TRANSFORM | _NEW_POINT | + _NEW_FOG | _NEW_LIGHT | + _MESA_NEW_NEED_EYE_COORDS); + } + + /* + * Now update derived state info + */ + + if (new_state & prog_flags) + update_program_enables( ctx ); + + if (new_state & (_NEW_MODELVIEW|_NEW_PROJECTION)) + _mesa_update_modelview_project( ctx, new_state ); + + if (new_state & (_NEW_PROGRAM|_NEW_TEXTURE|_NEW_TEXTURE_MATRIX)) + _mesa_update_texture( ctx, new_state ); + + if (new_state & _NEW_BUFFERS) + _mesa_update_framebuffer(ctx); + + if (new_state & (_NEW_SCISSOR | _NEW_BUFFERS | _NEW_VIEWPORT)) + _mesa_update_draw_buffer_bounds( ctx ); + + if (new_state & _NEW_POLYGON) + update_polygon( ctx ); + + if (new_state & _NEW_LIGHT) + _mesa_update_lighting( ctx ); + + if (new_state & (_NEW_STENCIL | _NEW_BUFFERS)) + _mesa_update_stencil( ctx ); + + if (new_state & _MESA_NEW_TRANSFER_STATE) + _mesa_update_pixel( ctx, new_state ); + + if (new_state & _DD_NEW_SEPARATE_SPECULAR) + update_separate_specular( ctx ); + + if (new_state & (_NEW_BUFFERS | _NEW_VIEWPORT)) + update_viewport_matrix(ctx); + + if (new_state & _NEW_MULTISAMPLE) + update_multisample( ctx ); + + if (new_state & _NEW_COLOR) + update_color( ctx ); + +#if 0 + if (new_state & (_NEW_POINT | _NEW_LINE | _NEW_POLYGON | _NEW_LIGHT + | _NEW_STENCIL | _DD_NEW_SEPARATE_SPECULAR)) + update_tricaps( ctx, new_state ); +#endif + + /* ctx->_NeedEyeCoords is now up to date. + * + * If the truth value of this variable has changed, update for the + * new lighting space and recompute the positions of lights and the + * normal transform. + * + * If the lighting space hasn't changed, may still need to recompute + * light positions & normal transforms for other reasons. + */ + if (new_state & _MESA_NEW_NEED_EYE_COORDS) + _mesa_update_tnl_spaces( ctx, new_state ); + + if (new_state & prog_flags) { + /* When we generate programs from fixed-function vertex/fragment state + * this call may generate/bind a new program. If so, we need to + * propogate the _NEW_PROGRAM flag to the driver. + */ + new_prog_state |= update_program( ctx ); + } + + if (new_state & (_NEW_ARRAY | _NEW_PROGRAM | _NEW_BUFFER_OBJECT)) + update_arrays( ctx ); + + out: + new_prog_state |= update_program_constants(ctx); + + /* + * Give the driver a chance to act upon the new_state flags. + * The driver might plug in different span functions, for example. + * Also, this is where the driver can invalidate the state of any + * active modules (such as swrast_setup, swrast, tnl, etc). + * + * Set ctx->NewState to zero to avoid recursion if + * Driver.UpdateState() has to call FLUSH_VERTICES(). (fixed?) 
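For context on the Driver.UpdateState() call made at the end of this function: drivers typically just fold the new_state bits into their own dirty flags. A hypothetical implementation (names invented, not taken from this patch) might look like:

    /* Hypothetical dd_function_table::UpdateState hook; types and flags are invented. */
    static void
    example_update_state(struct gl_context *ctx, GLbitfield new_state)
    {
       struct example_context *exctx = example_context(ctx);  /* driver's own downcast, assumed */

       if (new_state & _NEW_COLOR)
          exctx->dirty |= EXAMPLE_DIRTY_BLEND;
       if (new_state & (_NEW_BUFFERS | _NEW_VIEWPORT))
          exctx->dirty |= EXAMPLE_DIRTY_FRAMEBUFFER;
       if (new_state & _NEW_PROGRAM)
          exctx->dirty |= EXAMPLE_DIRTY_SHADERS;
    }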
+ */ + new_state = ctx->NewState | new_prog_state; + ctx->NewState = 0; + ctx->Driver.UpdateState(ctx, new_state); + ctx->Array.NewState = 0; + if (!ctx->Array.RebindArrays) + ctx->Array.RebindArrays = (new_state & (_NEW_ARRAY | _NEW_PROGRAM)) != 0; +} + + +/* This is the usual entrypoint for state updates: + */ +void +_mesa_update_state( struct gl_context *ctx ) +{ + _mesa_lock_context_textures(ctx); + _mesa_update_state_locked(ctx); + _mesa_unlock_context_textures(ctx); +} + + + + +/** + * Want to figure out which fragment program inputs are actually + * constant/current values from ctx->Current. These should be + * referenced as a tracked state variable rather than a fragment + * program input, to save the overhead of putting a constant value in + * every submitted vertex, transferring it to hardware, interpolating + * it across the triangle, etc... + * + * When there is a VP bound, just use vp->outputs. But when we're + * generating vp from fixed function state, basically want to + * calculate: + * + * vp_out_2_fp_in( vp_in_2_vp_out( varying_inputs ) | + * potential_vp_outputs ) + * + * Where potential_vp_outputs is calculated by looking at enabled + * texgen, etc. + * + * The generated fragment program should then only declare inputs that + * may vary or otherwise differ from the ctx->Current values. + * Otherwise, the fp should track them as state values instead. + */ +void +_mesa_set_varying_vp_inputs( struct gl_context *ctx, + GLbitfield varying_inputs ) +{ + if (ctx->varying_vp_inputs != varying_inputs) { + ctx->varying_vp_inputs = varying_inputs; + ctx->NewState |= _NEW_ARRAY; + /*printf("%s %x\n", __FUNCTION__, varying_inputs);*/ + } +} + + +/** + * Used by drivers to tell core Mesa that the driver is going to + * install/ use its own vertex program. In particular, this will + * prevent generated fragment programs from using state vars instead + * of ordinary varyings/inputs. + */ +void +_mesa_set_vp_override(struct gl_context *ctx, GLboolean flag) +{ + if (ctx->VertexProgram._Overriden != flag) { + ctx->VertexProgram._Overriden = flag; + + /* Set one of the bits which will trigger fragment program + * regeneration: + */ + ctx->NewState |= _NEW_PROGRAM; + } +} diff --git a/mesalib/src/mesa/state_tracker/st_atom_blend.c b/mesalib/src/mesa/state_tracker/st_atom_blend.c index 26bb3dab9..fb1c7a4ef 100644 --- a/mesalib/src/mesa/state_tracker/st_atom_blend.c +++ b/mesalib/src/mesa/state_tracker/st_atom_blend.c @@ -1,299 +1,302 @@ -/************************************************************************** - * - * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR - * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - **************************************************************************/ - - /* - * Authors: - * Keith Whitwell - * Brian Paul - */ - - -#include "st_context.h" -#include "st_atom.h" - -#include "pipe/p_context.h" -#include "pipe/p_defines.h" -#include "cso_cache/cso_context.h" - -#include "main/macros.h" - -/** - * Convert GLenum blend tokens to pipe tokens. - * Both blend factors and blend funcs are accepted. - */ -static GLuint -translate_blend(GLenum blend) -{ - switch (blend) { - /* blend functions */ - case GL_FUNC_ADD: - return PIPE_BLEND_ADD; - case GL_FUNC_SUBTRACT: - return PIPE_BLEND_SUBTRACT; - case GL_FUNC_REVERSE_SUBTRACT: - return PIPE_BLEND_REVERSE_SUBTRACT; - case GL_MIN: - return PIPE_BLEND_MIN; - case GL_MAX: - return PIPE_BLEND_MAX; - - /* blend factors */ - case GL_ONE: - return PIPE_BLENDFACTOR_ONE; - case GL_SRC_COLOR: - return PIPE_BLENDFACTOR_SRC_COLOR; - case GL_SRC_ALPHA: - return PIPE_BLENDFACTOR_SRC_ALPHA; - case GL_DST_ALPHA: - return PIPE_BLENDFACTOR_DST_ALPHA; - case GL_DST_COLOR: - return PIPE_BLENDFACTOR_DST_COLOR; - case GL_SRC_ALPHA_SATURATE: - return PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE; - case GL_CONSTANT_COLOR: - return PIPE_BLENDFACTOR_CONST_COLOR; - case GL_CONSTANT_ALPHA: - return PIPE_BLENDFACTOR_CONST_ALPHA; - /* - return PIPE_BLENDFACTOR_SRC1_COLOR; - return PIPE_BLENDFACTOR_SRC1_ALPHA; - */ - case GL_ZERO: - return PIPE_BLENDFACTOR_ZERO; - case GL_ONE_MINUS_SRC_COLOR: - return PIPE_BLENDFACTOR_INV_SRC_COLOR; - case GL_ONE_MINUS_SRC_ALPHA: - return PIPE_BLENDFACTOR_INV_SRC_ALPHA; - case GL_ONE_MINUS_DST_COLOR: - return PIPE_BLENDFACTOR_INV_DST_COLOR; - case GL_ONE_MINUS_DST_ALPHA: - return PIPE_BLENDFACTOR_INV_DST_ALPHA; - case GL_ONE_MINUS_CONSTANT_COLOR: - return PIPE_BLENDFACTOR_INV_CONST_COLOR; - case GL_ONE_MINUS_CONSTANT_ALPHA: - return PIPE_BLENDFACTOR_INV_CONST_ALPHA; - /* - return PIPE_BLENDFACTOR_INV_SRC1_COLOR; - return PIPE_BLENDFACTOR_INV_SRC1_ALPHA; - */ - default: - assert("invalid GL token in translate_blend()" == NULL); - return 0; - } -} - - -/** - * Convert GLenum logicop tokens to pipe tokens. - */ -static GLuint -translate_logicop(GLenum logicop) -{ - switch (logicop) { - case GL_CLEAR: - return PIPE_LOGICOP_CLEAR; - case GL_NOR: - return PIPE_LOGICOP_NOR; - case GL_AND_INVERTED: - return PIPE_LOGICOP_AND_INVERTED; - case GL_COPY_INVERTED: - return PIPE_LOGICOP_COPY_INVERTED; - case GL_AND_REVERSE: - return PIPE_LOGICOP_AND_REVERSE; - case GL_INVERT: - return PIPE_LOGICOP_INVERT; - case GL_XOR: - return PIPE_LOGICOP_XOR; - case GL_NAND: - return PIPE_LOGICOP_NAND; - case GL_AND: - return PIPE_LOGICOP_AND; - case GL_EQUIV: - return PIPE_LOGICOP_EQUIV; - case GL_NOOP: - return PIPE_LOGICOP_NOOP; - case GL_OR_INVERTED: - return PIPE_LOGICOP_OR_INVERTED; - case GL_COPY: - return PIPE_LOGICOP_COPY; - case GL_OR_REVERSE: - return PIPE_LOGICOP_OR_REVERSE; - case GL_OR: - return PIPE_LOGICOP_OR; - case GL_SET: - return PIPE_LOGICOP_SET; - default: - assert("invalid GL token in translate_logicop()" == NULL); - return 0; - } -} - -/** - * Figure out if colormasks are different per rt. 
- */ -static GLboolean -colormask_per_rt(struct gl_context *ctx) -{ - /* a bit suboptimal have to compare lots of values */ - unsigned i; - for (i = 1; i < ctx->Const.MaxDrawBuffers; i++) { - if (memcmp(ctx->Color.ColorMask[0], ctx->Color.ColorMask[i], 4)) { - return GL_TRUE; - } - } - return GL_FALSE; -} - -/** - * Figure out if blend enables/state are different per rt. - */ -static GLboolean -blend_per_rt(struct gl_context *ctx) -{ - if (ctx->Color.BlendEnabled && - (ctx->Color.BlendEnabled != ((1 << ctx->Const.MaxDrawBuffers) - 1))) { - /* This can only happen if GL_EXT_draw_buffers2 is enabled */ - return GL_TRUE; - } - if (ctx->Color._BlendFuncPerBuffer || ctx->Color._BlendEquationPerBuffer) { - /* this can only happen if GL_ARB_draw_buffers_blend is enabled */ - return GL_TRUE; - } - return GL_FALSE; -} - -static void -update_blend( struct st_context *st ) -{ - struct pipe_blend_state *blend = &st->state.blend; - unsigned num_state = 1; - unsigned i; - - memset(blend, 0, sizeof(*blend)); - - if (blend_per_rt(st->ctx) || colormask_per_rt(st->ctx)) { - num_state = st->ctx->Const.MaxDrawBuffers; - blend->independent_blend_enable = 1; - } - /* Note it is impossible to correctly deal with EXT_blend_logic_op and - EXT_draw_buffers2/EXT_blend_equation_separate at the same time. - These combinations would require support for per-rt logicop enables - and separate alpha/rgb logicop/blend support respectively. Neither - possible in gallium nor most hardware. Assume these combinations - don't happen. */ - if (st->ctx->Color.ColorLogicOpEnabled || - (st->ctx->Color.BlendEnabled && - st->ctx->Color.Blend[0].EquationRGB == GL_LOGIC_OP)) { - /* logicop enabled */ - blend->logicop_enable = 1; - blend->logicop_func = translate_logicop(st->ctx->Color.LogicOp); - } - else if (st->ctx->Color.BlendEnabled) { - /* blending enabled */ - for (i = 0; i < num_state; i++) { - - blend->rt[i].blend_enable = (st->ctx->Color.BlendEnabled >> i) & 0x1; - - blend->rt[i].rgb_func = - translate_blend(st->ctx->Color.Blend[i].EquationRGB); - - if (st->ctx->Color.Blend[i].EquationRGB == GL_MIN || - st->ctx->Color.Blend[i].EquationRGB == GL_MAX) { - /* Min/max are special */ - blend->rt[i].rgb_src_factor = PIPE_BLENDFACTOR_ONE; - blend->rt[i].rgb_dst_factor = PIPE_BLENDFACTOR_ONE; - } - else { - blend->rt[i].rgb_src_factor = - translate_blend(st->ctx->Color.Blend[i].SrcRGB); - blend->rt[i].rgb_dst_factor = - translate_blend(st->ctx->Color.Blend[i].DstRGB); - } - - blend->rt[i].alpha_func = - translate_blend(st->ctx->Color.Blend[i].EquationA); - - if (st->ctx->Color.Blend[i].EquationA == GL_MIN || - st->ctx->Color.Blend[i].EquationA == GL_MAX) { - /* Min/max are special */ - blend->rt[i].alpha_src_factor = PIPE_BLENDFACTOR_ONE; - blend->rt[i].alpha_dst_factor = PIPE_BLENDFACTOR_ONE; - } - else { - blend->rt[i].alpha_src_factor = - translate_blend(st->ctx->Color.Blend[i].SrcA); - blend->rt[i].alpha_dst_factor = - translate_blend(st->ctx->Color.Blend[i].DstA); - } - } - } - else { - /* no blending / logicop */ - } - - /* Colormask - maybe reverse these bits? 
*/ - for (i = 0; i < num_state; i++) { - if (st->ctx->Color.ColorMask[i][0]) - blend->rt[i].colormask |= PIPE_MASK_R; - if (st->ctx->Color.ColorMask[i][1]) - blend->rt[i].colormask |= PIPE_MASK_G; - if (st->ctx->Color.ColorMask[i][2]) - blend->rt[i].colormask |= PIPE_MASK_B; - if (st->ctx->Color.ColorMask[i][3]) - blend->rt[i].colormask |= PIPE_MASK_A; - } - - if (st->ctx->Color.DitherFlag) - blend->dither = 1; - - if (st->ctx->Multisample.Enabled) { - /* unlike in gallium/d3d10 these operations are only performed - if msaa is enabled */ - if (st->ctx->Multisample.SampleAlphaToCoverage) - blend->alpha_to_coverage = 1; - if (st->ctx->Multisample.SampleAlphaToOne) - blend->alpha_to_one = 1; - } - - cso_set_blend(st->cso_context, blend); - - { - struct pipe_blend_color bc; - COPY_4FV(bc.color, st->ctx->Color.BlendColor); - cso_set_blend_color(st->cso_context, &bc); - } -} - - -const struct st_tracked_state st_update_blend = { - "st_update_blend", /* name */ - { /* dirty */ - (_NEW_COLOR | _NEW_MULTISAMPLE), /* XXX _NEW_BLEND someday? */ /* mesa */ - 0, /* st */ - }, - update_blend, /* update */ -}; +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + /* + * Authors: + * Keith Whitwell + * Brian Paul + */ + + +#include "st_context.h" +#include "st_atom.h" + +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "cso_cache/cso_context.h" + +#include "main/macros.h" + +/** + * Convert GLenum blend tokens to pipe tokens. + * Both blend factors and blend funcs are accepted. 
+ */ +static GLuint +translate_blend(GLenum blend) +{ + switch (blend) { + /* blend functions */ + case GL_FUNC_ADD: + return PIPE_BLEND_ADD; + case GL_FUNC_SUBTRACT: + return PIPE_BLEND_SUBTRACT; + case GL_FUNC_REVERSE_SUBTRACT: + return PIPE_BLEND_REVERSE_SUBTRACT; + case GL_MIN: + return PIPE_BLEND_MIN; + case GL_MAX: + return PIPE_BLEND_MAX; + + /* blend factors */ + case GL_ONE: + return PIPE_BLENDFACTOR_ONE; + case GL_SRC_COLOR: + return PIPE_BLENDFACTOR_SRC_COLOR; + case GL_SRC_ALPHA: + return PIPE_BLENDFACTOR_SRC_ALPHA; + case GL_DST_ALPHA: + return PIPE_BLENDFACTOR_DST_ALPHA; + case GL_DST_COLOR: + return PIPE_BLENDFACTOR_DST_COLOR; + case GL_SRC_ALPHA_SATURATE: + return PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE; + case GL_CONSTANT_COLOR: + return PIPE_BLENDFACTOR_CONST_COLOR; + case GL_CONSTANT_ALPHA: + return PIPE_BLENDFACTOR_CONST_ALPHA; + /* + return PIPE_BLENDFACTOR_SRC1_COLOR; + return PIPE_BLENDFACTOR_SRC1_ALPHA; + */ + case GL_ZERO: + return PIPE_BLENDFACTOR_ZERO; + case GL_ONE_MINUS_SRC_COLOR: + return PIPE_BLENDFACTOR_INV_SRC_COLOR; + case GL_ONE_MINUS_SRC_ALPHA: + return PIPE_BLENDFACTOR_INV_SRC_ALPHA; + case GL_ONE_MINUS_DST_COLOR: + return PIPE_BLENDFACTOR_INV_DST_COLOR; + case GL_ONE_MINUS_DST_ALPHA: + return PIPE_BLENDFACTOR_INV_DST_ALPHA; + case GL_ONE_MINUS_CONSTANT_COLOR: + return PIPE_BLENDFACTOR_INV_CONST_COLOR; + case GL_ONE_MINUS_CONSTANT_ALPHA: + return PIPE_BLENDFACTOR_INV_CONST_ALPHA; + /* + return PIPE_BLENDFACTOR_INV_SRC1_COLOR; + return PIPE_BLENDFACTOR_INV_SRC1_ALPHA; + */ + default: + assert("invalid GL token in translate_blend()" == NULL); + return 0; + } +} + + +/** + * Convert GLenum logicop tokens to pipe tokens. + */ +static GLuint +translate_logicop(GLenum logicop) +{ + switch (logicop) { + case GL_CLEAR: + return PIPE_LOGICOP_CLEAR; + case GL_NOR: + return PIPE_LOGICOP_NOR; + case GL_AND_INVERTED: + return PIPE_LOGICOP_AND_INVERTED; + case GL_COPY_INVERTED: + return PIPE_LOGICOP_COPY_INVERTED; + case GL_AND_REVERSE: + return PIPE_LOGICOP_AND_REVERSE; + case GL_INVERT: + return PIPE_LOGICOP_INVERT; + case GL_XOR: + return PIPE_LOGICOP_XOR; + case GL_NAND: + return PIPE_LOGICOP_NAND; + case GL_AND: + return PIPE_LOGICOP_AND; + case GL_EQUIV: + return PIPE_LOGICOP_EQUIV; + case GL_NOOP: + return PIPE_LOGICOP_NOOP; + case GL_OR_INVERTED: + return PIPE_LOGICOP_OR_INVERTED; + case GL_COPY: + return PIPE_LOGICOP_COPY; + case GL_OR_REVERSE: + return PIPE_LOGICOP_OR_REVERSE; + case GL_OR: + return PIPE_LOGICOP_OR; + case GL_SET: + return PIPE_LOGICOP_SET; + default: + assert("invalid GL token in translate_logicop()" == NULL); + return 0; + } +} + +/** + * Figure out if colormasks are different per rt. + */ +static GLboolean +colormask_per_rt(struct gl_context *ctx) +{ + /* a bit suboptimal have to compare lots of values */ + unsigned i; + for (i = 1; i < ctx->Const.MaxDrawBuffers; i++) { + if (memcmp(ctx->Color.ColorMask[0], ctx->Color.ColorMask[i], 4)) { + return GL_TRUE; + } + } + return GL_FALSE; +} + +/** + * Figure out if blend enables/state are different per rt. 
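blend_per_rt() below decides this from the BlendEnabled bitmask and the per-buffer derived flags. A concrete illustration with invented numbers: with MaxDrawBuffers = 4, enabling blending only on buffers 0 and 2 is not the "all buffers" mask, so per-RT blend state is required.

    /* Illustration only -- mirrors the test in blend_per_rt(). */
    GLbitfield blend_enabled = (1 << 0) | (1 << 2);   /* buffers 0 and 2 -> 0x5      */
    GLbitfield all_enabled   = (1 << 4) - 1;          /* MaxDrawBuffers = 4 -> 0xf   */
    GLboolean  per_rt = blend_enabled && (blend_enabled != all_enabled);   /* GL_TRUE */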
+ */ +static GLboolean +blend_per_rt(struct gl_context *ctx) +{ + if (ctx->Color.BlendEnabled && + (ctx->Color.BlendEnabled != ((1 << ctx->Const.MaxDrawBuffers) - 1))) { + /* This can only happen if GL_EXT_draw_buffers2 is enabled */ + return GL_TRUE; + } + if (ctx->Color._BlendFuncPerBuffer || ctx->Color._BlendEquationPerBuffer) { + /* this can only happen if GL_ARB_draw_buffers_blend is enabled */ + return GL_TRUE; + } + return GL_FALSE; +} + +static void +update_blend( struct st_context *st ) +{ + struct pipe_blend_state *blend = &st->state.blend; + unsigned num_state = 1; + unsigned i, j; + + memset(blend, 0, sizeof(*blend)); + + if (blend_per_rt(st->ctx) || colormask_per_rt(st->ctx)) { + num_state = st->ctx->Const.MaxDrawBuffers; + blend->independent_blend_enable = 1; + } + /* Note it is impossible to correctly deal with EXT_blend_logic_op and + EXT_draw_buffers2/EXT_blend_equation_separate at the same time. + These combinations would require support for per-rt logicop enables + and separate alpha/rgb logicop/blend support respectively. Neither + possible in gallium nor most hardware. Assume these combinations + don't happen. */ + if (st->ctx->Color.ColorLogicOpEnabled || + (st->ctx->Color.BlendEnabled && + st->ctx->Color.Blend[0].EquationRGB == GL_LOGIC_OP)) { + /* logicop enabled */ + blend->logicop_enable = 1; + blend->logicop_func = translate_logicop(st->ctx->Color.LogicOp); + } + else if (st->ctx->Color.BlendEnabled) { + /* blending enabled */ + for (i = 0, j = 0; i < num_state; i++) { + + blend->rt[i].blend_enable = (st->ctx->Color.BlendEnabled >> i) & 0x1; + + if (st->ctx->Extensions.ARB_draw_buffers_blend) + j = i; + + blend->rt[i].rgb_func = + translate_blend(st->ctx->Color.Blend[j].EquationRGB); + + if (st->ctx->Color.Blend[i].EquationRGB == GL_MIN || + st->ctx->Color.Blend[i].EquationRGB == GL_MAX) { + /* Min/max are special */ + blend->rt[i].rgb_src_factor = PIPE_BLENDFACTOR_ONE; + blend->rt[i].rgb_dst_factor = PIPE_BLENDFACTOR_ONE; + } + else { + blend->rt[i].rgb_src_factor = + translate_blend(st->ctx->Color.Blend[j].SrcRGB); + blend->rt[i].rgb_dst_factor = + translate_blend(st->ctx->Color.Blend[j].DstRGB); + } + + blend->rt[i].alpha_func = + translate_blend(st->ctx->Color.Blend[j].EquationA); + + if (st->ctx->Color.Blend[i].EquationA == GL_MIN || + st->ctx->Color.Blend[i].EquationA == GL_MAX) { + /* Min/max are special */ + blend->rt[i].alpha_src_factor = PIPE_BLENDFACTOR_ONE; + blend->rt[i].alpha_dst_factor = PIPE_BLENDFACTOR_ONE; + } + else { + blend->rt[i].alpha_src_factor = + translate_blend(st->ctx->Color.Blend[j].SrcA); + blend->rt[i].alpha_dst_factor = + translate_blend(st->ctx->Color.Blend[j].DstA); + } + } + } + else { + /* no blending / logicop */ + } + + /* Colormask - maybe reverse these bits? 
*/ + for (i = 0; i < num_state; i++) { + if (st->ctx->Color.ColorMask[i][0]) + blend->rt[i].colormask |= PIPE_MASK_R; + if (st->ctx->Color.ColorMask[i][1]) + blend->rt[i].colormask |= PIPE_MASK_G; + if (st->ctx->Color.ColorMask[i][2]) + blend->rt[i].colormask |= PIPE_MASK_B; + if (st->ctx->Color.ColorMask[i][3]) + blend->rt[i].colormask |= PIPE_MASK_A; + } + + if (st->ctx->Color.DitherFlag) + blend->dither = 1; + + if (st->ctx->Multisample.Enabled) { + /* unlike in gallium/d3d10 these operations are only performed + if msaa is enabled */ + if (st->ctx->Multisample.SampleAlphaToCoverage) + blend->alpha_to_coverage = 1; + if (st->ctx->Multisample.SampleAlphaToOne) + blend->alpha_to_one = 1; + } + + cso_set_blend(st->cso_context, blend); + + { + struct pipe_blend_color bc; + COPY_4FV(bc.color, st->ctx->Color.BlendColor); + cso_set_blend_color(st->cso_context, &bc); + } +} + + +const struct st_tracked_state st_update_blend = { + "st_update_blend", /* name */ + { /* dirty */ + (_NEW_COLOR | _NEW_MULTISAMPLE), /* XXX _NEW_BLEND someday? */ /* mesa */ + 0, /* st */ + }, + update_blend, /* update */ +}; diff --git a/mesalib/src/mesa/state_tracker/st_cb_bitmap.c b/mesalib/src/mesa/state_tracker/st_cb_bitmap.c index ddd130a81..0ea567155 100644 --- a/mesalib/src/mesa/state_tracker/st_cb_bitmap.c +++ b/mesalib/src/mesa/state_tracker/st_cb_bitmap.c @@ -1,890 +1,893 @@ -/************************************************************************** - * - * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. - * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR - * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - **************************************************************************/ - - /* - * Authors: - * Brian Paul - */ - -#include "main/imports.h" -#include "main/image.h" -#include "main/bufferobj.h" -#include "main/macros.h" -#include "main/mfeatures.h" -#include "program/program.h" -#include "program/prog_print.h" - -#include "st_context.h" -#include "st_atom.h" -#include "st_atom_constbuf.h" -#include "st_program.h" -#include "st_cb_bitmap.h" -#include "st_texture.h" - -#include "pipe/p_context.h" -#include "pipe/p_defines.h" -#include "pipe/p_shader_tokens.h" -#include "util/u_inlines.h" -#include "util/u_draw_quad.h" -#include "util/u_simple_shaders.h" -#include "program/prog_instruction.h" -#include "cso_cache/cso_context.h" - - -#if FEATURE_drawpix - -/** - * glBitmaps are drawn as textured quads. 
The user's bitmap pattern - * is stored in a texture image. An alpha8 texture format is used. - * The fragment shader samples a bit (texel) from the texture, then - * discards the fragment if the bit is off. - * - * Note that we actually store the inverse image of the bitmap to - * simplify the fragment program. An "on" bit gets stored as texel=0x0 - * and an "off" bit is stored as texel=0xff. Then we kill the - * fragment if the negated texel value is less than zero. - */ - - -/** - * The bitmap cache attempts to accumulate multiple glBitmap calls in a - * buffer which is then rendered en mass upon a flush, state change, etc. - * A wide, short buffer is used to target the common case of a series - * of glBitmap calls being used to draw text. - */ -static GLboolean UseBitmapCache = GL_TRUE; - - -#define BITMAP_CACHE_WIDTH 512 -#define BITMAP_CACHE_HEIGHT 32 - -struct bitmap_cache -{ - /** Window pos to render the cached image */ - GLint xpos, ypos; - /** Bounds of region used in window coords */ - GLint xmin, ymin, xmax, ymax; - - GLfloat color[4]; - - /** Bitmap's Z position */ - GLfloat zpos; - - struct pipe_resource *texture; - struct pipe_transfer *trans; - - GLboolean empty; - - /** An I8 texture image: */ - ubyte *buffer; -}; - - -/** Epsilon for Z comparisons */ -#define Z_EPSILON 1e-06 - - -/** - * Make fragment program for glBitmap: - * Sample the texture and kill the fragment if the bit is 0. - * This program will be combined with the user's fragment program. - */ -static struct st_fragment_program * -make_bitmap_fragment_program(struct gl_context *ctx, GLuint samplerIndex) -{ - struct st_context *st = st_context(ctx); - struct st_fragment_program *stfp; - struct gl_program *p; - GLuint ic = 0; - - p = ctx->Driver.NewProgram(ctx, GL_FRAGMENT_PROGRAM_ARB, 0); - if (!p) - return NULL; - - p->NumInstructions = 3; - - p->Instructions = _mesa_alloc_instructions(p->NumInstructions); - if (!p->Instructions) { - ctx->Driver.DeleteProgram(ctx, p); - return NULL; - } - _mesa_init_instructions(p->Instructions, p->NumInstructions); - - /* TEX tmp0, fragment.texcoord[0], texture[0], 2D; */ - p->Instructions[ic].Opcode = OPCODE_TEX; - p->Instructions[ic].DstReg.File = PROGRAM_TEMPORARY; - p->Instructions[ic].DstReg.Index = 0; - p->Instructions[ic].SrcReg[0].File = PROGRAM_INPUT; - p->Instructions[ic].SrcReg[0].Index = FRAG_ATTRIB_TEX0; - p->Instructions[ic].TexSrcUnit = samplerIndex; - p->Instructions[ic].TexSrcTarget = TEXTURE_2D_INDEX; - ic++; - - /* KIL if -tmp0 < 0 # texel=0 -> keep / texel=0 -> discard */ - p->Instructions[ic].Opcode = OPCODE_KIL; - p->Instructions[ic].SrcReg[0].File = PROGRAM_TEMPORARY; - - if (st->bitmap.tex_format == PIPE_FORMAT_L8_UNORM) - p->Instructions[ic].SrcReg[0].Swizzle = SWIZZLE_XXXX; - - p->Instructions[ic].SrcReg[0].Index = 0; - p->Instructions[ic].SrcReg[0].Negate = NEGATE_XYZW; - ic++; - - /* END; */ - p->Instructions[ic++].Opcode = OPCODE_END; - - assert(ic == p->NumInstructions); - - p->InputsRead = FRAG_BIT_TEX0; - p->OutputsWritten = 0x0; - p->SamplersUsed = (1 << samplerIndex); - - stfp = (struct st_fragment_program *) p; - stfp->Base.UsesKill = GL_TRUE; - - return stfp; -} - - -static int -find_free_bit(uint bitfield) -{ - int i; - for (i = 0; i < 32; i++) { - if ((bitfield & (1 << i)) == 0) { - return i; - } - } - return -1; -} - - -/** - * Combine basic bitmap fragment program with the user-defined program. 
- * \param st current context - * \param fpIn the incoming fragment program - * \param fpOut the new fragment program which does fragment culling - * \param bitmap_sampler sampler number for the bitmap texture - */ -void -st_make_bitmap_fragment_program(struct st_context *st, - struct gl_fragment_program *fpIn, - struct gl_fragment_program **fpOut, - GLuint *bitmap_sampler) -{ - struct st_fragment_program *bitmap_prog; - struct gl_program *newProg; - uint sampler; - - /* - * Generate new program which is the user-defined program prefixed - * with the bitmap sampler/kill instructions. - */ - sampler = find_free_bit(fpIn->Base.SamplersUsed); - bitmap_prog = make_bitmap_fragment_program(st->ctx, sampler); - - newProg = _mesa_combine_programs(st->ctx, - &bitmap_prog->Base.Base, - &fpIn->Base); - /* done with this after combining */ - st_reference_fragprog(st, &bitmap_prog, NULL); - -#if 0 - { - printf("Combined bitmap program:\n"); - _mesa_print_program(newProg); - printf("InputsRead: 0x%x\n", newProg->InputsRead); - printf("OutputsWritten: 0x%x\n", newProg->OutputsWritten); - _mesa_print_parameter_list(newProg->Parameters); - } -#endif - - /* return results */ - *fpOut = (struct gl_fragment_program *) newProg; - *bitmap_sampler = sampler; -} - - -/** - * Copy user-provide bitmap bits into texture buffer, expanding - * bits into texels. - * "On" bits will set texels to 0x0. - * "Off" bits will not modify texels. - * Note that the image is actually going to be upside down in - * the texture. We deal with that with texcoords. - */ -static void -unpack_bitmap(struct st_context *st, - GLint px, GLint py, GLsizei width, GLsizei height, - const struct gl_pixelstore_attrib *unpack, - const GLubyte *bitmap, - ubyte *destBuffer, uint destStride) -{ - destBuffer += py * destStride + px; - - _mesa_expand_bitmap(width, height, unpack, bitmap, - destBuffer, destStride, 0x0); -} - - -/** - * Create a texture which represents a bitmap image. - */ -static struct pipe_resource * -make_bitmap_texture(struct gl_context *ctx, GLsizei width, GLsizei height, - const struct gl_pixelstore_attrib *unpack, - const GLubyte *bitmap) -{ - struct st_context *st = st_context(ctx); - struct pipe_context *pipe = st->pipe; - struct pipe_transfer *transfer; - ubyte *dest; - struct pipe_resource *pt; - - /* PBO source... */ - bitmap = _mesa_map_pbo_source(ctx, unpack, bitmap); - if (!bitmap) { - return NULL; - } - - /** - * Create texture to hold bitmap pattern. 
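To spell out the bit-to-texel encoding described in the file comment and used by unpack_bitmap() above (illustrative arithmetic only): the texture is cleared to 0xff, expansion then writes 0x00 for every set bitmap bit, and the TEX+KIL program generated earlier keeps only the "on" texels.

    /* Illustration of the kill test performed by the generated fragment program. */
    static GLboolean
    bitmap_fragment_survives(ubyte texel)
    {
       GLfloat value = texel / 255.0f;   /* 0x00 for "on" bits, 0xff for "off" bits     */
       return !(-value < 0.0f);          /* KIL if -tmp0 < 0, so only "on" bits survive */
    }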
- */ - pt = st_texture_create(st, st->internal_target, st->bitmap.tex_format, - 0, width, height, 1, 1, - PIPE_BIND_SAMPLER_VIEW); - if (!pt) { - _mesa_unmap_pbo_source(ctx, unpack); - return NULL; - } - - transfer = pipe_get_transfer(st->pipe, pt, 0, 0, - PIPE_TRANSFER_WRITE, - 0, 0, width, height); - - dest = pipe_transfer_map(pipe, transfer); - - /* Put image into texture transfer */ - memset(dest, 0xff, height * transfer->stride); - unpack_bitmap(st, 0, 0, width, height, unpack, bitmap, - dest, transfer->stride); - - _mesa_unmap_pbo_source(ctx, unpack); - - /* Release transfer */ - pipe_transfer_unmap(pipe, transfer); - pipe->transfer_destroy(pipe, transfer); - - return pt; -} - -static GLuint -setup_bitmap_vertex_data(struct st_context *st, bool normalized, - int x, int y, int width, int height, - float z, const float color[4]) -{ - struct pipe_context *pipe = st->pipe; - const struct gl_framebuffer *fb = st->ctx->DrawBuffer; - const GLfloat fb_width = (GLfloat)fb->Width; - const GLfloat fb_height = (GLfloat)fb->Height; - const GLfloat x0 = (GLfloat)x; - const GLfloat x1 = (GLfloat)(x + width); - const GLfloat y0 = (GLfloat)y; - const GLfloat y1 = (GLfloat)(y + height); - GLfloat sLeft = (GLfloat)0.0, sRight = (GLfloat)1.0; - GLfloat tTop = (GLfloat)0.0, tBot = (GLfloat)1.0 - tTop; - const GLfloat clip_x0 = (GLfloat)(x0 / fb_width * 2.0 - 1.0); - const GLfloat clip_y0 = (GLfloat)(y0 / fb_height * 2.0 - 1.0); - const GLfloat clip_x1 = (GLfloat)(x1 / fb_width * 2.0 - 1.0); - const GLfloat clip_y1 = (GLfloat)(y1 / fb_height * 2.0 - 1.0); - const GLuint max_slots = 1; /* 4096 / sizeof(st->bitmap.vertices); */ - GLuint i; - - if(!normalized) - { - sRight = width; - tBot = height; - } - - /* XXX: Need to improve buffer_write to allow NO_WAIT (as well as - * no_flush) updates to buffers where we know there is no conflict - * with previous data. Currently using max_slots > 1 will cause - * synchronous rendering if the driver flushes its command buffers - * between one bitmap and the next. Our flush hook below isn't - * sufficient to catch this as the driver doesn't tell us when it - * flushes its own command buffers. Until this gets fixed, pay the - * price of allocating a new buffer for each bitmap cache-flush to - * avoid synchronous rendering. - */ - if (st->bitmap.vbuf_slot >= max_slots) { - pipe_resource_reference(&st->bitmap.vbuf, NULL); - st->bitmap.vbuf_slot = 0; - } - - if (!st->bitmap.vbuf) { - st->bitmap.vbuf = pipe_buffer_create(pipe->screen, - PIPE_BIND_VERTEX_BUFFER, - max_slots * - sizeof(st->bitmap.vertices)); - } - - /* Positions are in clip coords since we need to do clipping in case - * the bitmap quad goes beyond the window bounds. 
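A worked example of the window-to-clip conversion used here, with made-up numbers (an 800-pixel-wide framebuffer and a bitmap at x = 100, width = 256):

    /* Illustration only -- same arithmetic as the clip_x0/clip_x1 setup above. */
    GLfloat fb_width = 800.0f;
    GLfloat x0 = 100.0f, x1 = 100.0f + 256.0f;
    GLfloat clip_x0 = x0 / fb_width * 2.0f - 1.0f;   /* -0.75 */
    GLfloat clip_x1 = x1 / fb_width * 2.0f - 1.0f;   /* -0.11 */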
- */ - st->bitmap.vertices[0][0][0] = clip_x0; - st->bitmap.vertices[0][0][1] = clip_y0; - st->bitmap.vertices[0][2][0] = sLeft; - st->bitmap.vertices[0][2][1] = tTop; - - st->bitmap.vertices[1][0][0] = clip_x1; - st->bitmap.vertices[1][0][1] = clip_y0; - st->bitmap.vertices[1][2][0] = sRight; - st->bitmap.vertices[1][2][1] = tTop; - - st->bitmap.vertices[2][0][0] = clip_x1; - st->bitmap.vertices[2][0][1] = clip_y1; - st->bitmap.vertices[2][2][0] = sRight; - st->bitmap.vertices[2][2][1] = tBot; - - st->bitmap.vertices[3][0][0] = clip_x0; - st->bitmap.vertices[3][0][1] = clip_y1; - st->bitmap.vertices[3][2][0] = sLeft; - st->bitmap.vertices[3][2][1] = tBot; - - /* same for all verts: */ - for (i = 0; i < 4; i++) { - st->bitmap.vertices[i][0][2] = z; - st->bitmap.vertices[i][0][3] = 1.0; - st->bitmap.vertices[i][1][0] = color[0]; - st->bitmap.vertices[i][1][1] = color[1]; - st->bitmap.vertices[i][1][2] = color[2]; - st->bitmap.vertices[i][1][3] = color[3]; - st->bitmap.vertices[i][2][2] = 0.0; /*R*/ - st->bitmap.vertices[i][2][3] = 1.0; /*Q*/ - } - - /* put vertex data into vbuf */ - pipe_buffer_write_nooverlap(st->pipe, - st->bitmap.vbuf, - st->bitmap.vbuf_slot - * sizeof(st->bitmap.vertices), - sizeof st->bitmap.vertices, - st->bitmap.vertices); - - return st->bitmap.vbuf_slot++ * sizeof st->bitmap.vertices; -} - - - -/** - * Render a glBitmap by drawing a textured quad - */ -static void -draw_bitmap_quad(struct gl_context *ctx, GLint x, GLint y, GLfloat z, - GLsizei width, GLsizei height, - struct pipe_sampler_view *sv, - const GLfloat *color) -{ - struct st_context *st = st_context(ctx); - struct pipe_context *pipe = st->pipe; - struct cso_context *cso = st->cso_context; - struct st_fp_variant *fpv; - struct st_fp_variant_key key; - GLuint maxSize; - GLuint offset; - - memset(&key, 0, sizeof(key)); - key.st = st; - key.bitmap = GL_TRUE; - - fpv = st_get_fp_variant(st, st->fp, &key); - - /* As an optimization, Mesa's fragment programs will sometimes get the - * primary color from a statevar/constant rather than a varying variable. - * when that's the case, we need to ensure that we use the 'color' - * parameter and not the current attribute color (which may have changed - * through glRasterPos and state validation. - * So, we force the proper color here. Not elegant, but it works. - */ - { - GLfloat colorSave[4]; - COPY_4V(colorSave, ctx->Current.Attrib[VERT_ATTRIB_COLOR0]); - COPY_4V(ctx->Current.Attrib[VERT_ATTRIB_COLOR0], color); - st_upload_constants(st, fpv->parameters, PIPE_SHADER_FRAGMENT); - COPY_4V(ctx->Current.Attrib[VERT_ATTRIB_COLOR0], colorSave); - } - - - /* limit checks */ - /* XXX if the bitmap is larger than the max texture size, break - * it up into chunks. 
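The size limit checked right after this comment comes from PIPE_CAP_MAX_TEXTURE_2D_LEVELS; for example (the cap value here is invented), a screen reporting 13 mipmap levels allows bitmaps up to 1 << (13 - 1) = 4096 texels per side.

    /* Illustration of the maxSize computation that follows. */
    GLuint max_levels = 13;                  /* assumed PIPE_CAP_MAX_TEXTURE_2D_LEVELS */
    GLuint maxSize = 1 << (max_levels - 1);  /* 4096 */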
- */ - maxSize = 1 << (pipe->screen->get_param(pipe->screen, - PIPE_CAP_MAX_TEXTURE_2D_LEVELS) - 1); - assert(width <= (GLsizei)maxSize); - assert(height <= (GLsizei)maxSize); - - cso_save_rasterizer(cso); - cso_save_samplers(cso); - cso_save_fragment_sampler_views(cso); - cso_save_viewport(cso); - cso_save_fragment_shader(cso); - cso_save_vertex_shader(cso); - cso_save_vertex_elements(cso); - - /* rasterizer state: just scissor */ - st->bitmap.rasterizer.scissor = ctx->Scissor.Enabled; - cso_set_rasterizer(cso, &st->bitmap.rasterizer); - - /* fragment shader state: TEX lookup program */ - cso_set_fragment_shader_handle(cso, fpv->driver_shader); - - /* vertex shader state: position + texcoord pass-through */ - cso_set_vertex_shader_handle(cso, st->bitmap.vs); - - /* user samplers, plus our bitmap sampler */ - { - struct pipe_sampler_state *samplers[PIPE_MAX_SAMPLERS]; - uint num = MAX2(fpv->bitmap_sampler + 1, st->state.num_samplers); - uint i; - for (i = 0; i < st->state.num_samplers; i++) { - samplers[i] = &st->state.samplers[i]; - } - samplers[fpv->bitmap_sampler] = - &st->bitmap.samplers[sv->texture->target != PIPE_TEXTURE_RECT]; - cso_set_samplers(cso, num, (const struct pipe_sampler_state **) samplers); - } - - /* user textures, plus the bitmap texture */ - { - struct pipe_sampler_view *sampler_views[PIPE_MAX_SAMPLERS]; - uint num = MAX2(fpv->bitmap_sampler + 1, st->state.num_textures); - memcpy(sampler_views, st->state.sampler_views, sizeof(sampler_views)); - sampler_views[fpv->bitmap_sampler] = sv; - cso_set_fragment_sampler_views(cso, num, sampler_views); - } - - /* viewport state: viewport matching window dims */ - { - const struct gl_framebuffer *fb = st->ctx->DrawBuffer; - const GLboolean invert = (st_fb_orientation(fb) == Y_0_TOP); - const GLfloat width = (GLfloat)fb->Width; - const GLfloat height = (GLfloat)fb->Height; - struct pipe_viewport_state vp; - vp.scale[0] = 0.5f * width; - vp.scale[1] = height * (invert ? 
-0.5f : 0.5f); - vp.scale[2] = 0.5f; - vp.scale[3] = 1.0f; - vp.translate[0] = 0.5f * width; - vp.translate[1] = 0.5f * height; - vp.translate[2] = 0.5f; - vp.translate[3] = 0.0f; - cso_set_viewport(cso, &vp); - } - - cso_set_vertex_elements(cso, 3, st->velems_util_draw); - - /* convert Z from [0,1] to [-1,-1] to match viewport Z scale/bias */ - z = z * 2.0 - 1.0; - - /* draw textured quad */ - offset = setup_bitmap_vertex_data(st, - sv->texture->target != PIPE_TEXTURE_RECT, - x, y, width, height, z, color); - - util_draw_vertex_buffer(pipe, st->bitmap.vbuf, offset, - PIPE_PRIM_TRIANGLE_FAN, - 4, /* verts */ - 3); /* attribs/vert */ - - - /* restore state */ - cso_restore_rasterizer(cso); - cso_restore_samplers(cso); - cso_restore_fragment_sampler_views(cso); - cso_restore_viewport(cso); - cso_restore_fragment_shader(cso); - cso_restore_vertex_shader(cso); - cso_restore_vertex_elements(cso); -} - - -static void -reset_cache(struct st_context *st) -{ - struct pipe_context *pipe = st->pipe; - struct bitmap_cache *cache = st->bitmap.cache; - - /*memset(cache->buffer, 0xff, sizeof(cache->buffer));*/ - cache->empty = GL_TRUE; - - cache->xmin = 1000000; - cache->xmax = -1000000; - cache->ymin = 1000000; - cache->ymax = -1000000; - - if (cache->trans) { - pipe->transfer_destroy(pipe, cache->trans); - cache->trans = NULL; - } - - assert(!cache->texture); - - /* allocate a new texture */ - cache->texture = st_texture_create(st, PIPE_TEXTURE_2D, - st->bitmap.tex_format, 0, - BITMAP_CACHE_WIDTH, BITMAP_CACHE_HEIGHT, - 1, 1, - PIPE_BIND_SAMPLER_VIEW); -} - - -/** Print bitmap image to stdout (debug) */ -static void -print_cache(const struct bitmap_cache *cache) -{ - int i, j, k; - - for (i = 0; i < BITMAP_CACHE_HEIGHT; i++) { - k = BITMAP_CACHE_WIDTH * (BITMAP_CACHE_HEIGHT - i - 1); - for (j = 0; j < BITMAP_CACHE_WIDTH; j++) { - if (cache->buffer[k]) - printf("X"); - else - printf(" "); - k++; - } - printf("\n"); - } -} - - -/** - * Create gallium pipe_transfer object for the bitmap cache. - */ -static void -create_cache_trans(struct st_context *st) -{ - struct pipe_context *pipe = st->pipe; - struct bitmap_cache *cache = st->bitmap.cache; - - if (cache->trans) - return; - - /* Map the texture transfer. - * Subsequent glBitmap calls will write into the texture image. - */ - cache->trans = pipe_get_transfer(st->pipe, cache->texture, 0, 0, - PIPE_TRANSFER_WRITE, 0, 0, - BITMAP_CACHE_WIDTH, - BITMAP_CACHE_HEIGHT); - cache->buffer = pipe_transfer_map(pipe, cache->trans); - - /* init image to all 0xff */ - memset(cache->buffer, 0xff, cache->trans->stride * BITMAP_CACHE_HEIGHT); -} - - -/** - * If there's anything in the bitmap cache, draw/flush it now. - */ -void -st_flush_bitmap_cache(struct st_context *st) -{ - if (!st->bitmap.cache->empty) { - struct bitmap_cache *cache = st->bitmap.cache; - - if (st->ctx->DrawBuffer) { - struct pipe_context *pipe = st->pipe; - struct pipe_sampler_view *sv; - - assert(cache->xmin <= cache->xmax); - -/* printf("flush size %d x %d at %d, %d\n", - cache->xmax - cache->xmin, - cache->ymax - cache->ymin, - cache->xpos, cache->ypos); -*/ - - /* The texture transfer has been mapped until now. - * So unmap and release the texture transfer before drawing. 
- */ - if (cache->trans) { - if (0) - print_cache(cache); - pipe_transfer_unmap(pipe, cache->trans); - cache->buffer = NULL; - - pipe->transfer_destroy(pipe, cache->trans); - cache->trans = NULL; - } - - sv = st_create_texture_sampler_view(st->pipe, cache->texture); - if (sv) { - draw_bitmap_quad(st->ctx, - cache->xpos, - cache->ypos, - cache->zpos, - BITMAP_CACHE_WIDTH, BITMAP_CACHE_HEIGHT, - sv, - cache->color); - - pipe_sampler_view_reference(&sv, NULL); - } - } - - /* release/free the texture */ - pipe_resource_reference(&cache->texture, NULL); - - reset_cache(st); - } -} - - -/** - * Flush bitmap cache and release vertex buffer. - */ -void -st_flush_bitmap( struct st_context *st ) -{ - st_flush_bitmap_cache(st); - - /* Release vertex buffer to avoid synchronous rendering if we were - * to map it in the next frame. - */ - pipe_resource_reference(&st->bitmap.vbuf, NULL); - st->bitmap.vbuf_slot = 0; -} - - -/** - * Try to accumulate this glBitmap call in the bitmap cache. - * \return GL_TRUE for success, GL_FALSE if bitmap is too large, etc. - */ -static GLboolean -accum_bitmap(struct st_context *st, - GLint x, GLint y, GLsizei width, GLsizei height, - const struct gl_pixelstore_attrib *unpack, - const GLubyte *bitmap ) -{ - struct bitmap_cache *cache = st->bitmap.cache; - int px = -999, py = -999; - const GLfloat z = st->ctx->Current.RasterPos[2]; - - if (width > BITMAP_CACHE_WIDTH || - height > BITMAP_CACHE_HEIGHT) - return GL_FALSE; /* too big to cache */ - - if (!cache->empty) { - px = x - cache->xpos; /* pos in buffer */ - py = y - cache->ypos; - if (px < 0 || px + width > BITMAP_CACHE_WIDTH || - py < 0 || py + height > BITMAP_CACHE_HEIGHT || - !TEST_EQ_4V(st->ctx->Current.RasterColor, cache->color) || - ((fabs(z - cache->zpos) > Z_EPSILON))) { - /* This bitmap would extend beyond cache bounds, or the bitmap - * color is changing - * so flush and continue. - */ - st_flush_bitmap_cache(st); - } - } - - if (cache->empty) { - /* Initialize. Center bitmap vertically in the buffer. 
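As a concrete illustration of the accumulation policy in accum_bitmap() (values invented): with BITMAP_CACHE_WIDTH of 512 and a cache anchored at xpos = 40, a 32-pixel-wide glBitmap at x = 560 would land at px = 520 and overflow the cache, so the cache is flushed before a new one is started.

    /* Illustration only -- mirrors the overflow test in accum_bitmap(). */
    GLint cache_xpos = 40;
    GLint x = 560, width = 32;
    GLint px = x - cache_xpos;                                            /* 520     */
    GLboolean must_flush = (px < 0 || px + width > BITMAP_CACHE_WIDTH);   /* GL_TRUE */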
*/ - px = 0; - py = (BITMAP_CACHE_HEIGHT - height) / 2; - cache->xpos = x; - cache->ypos = y - py; - cache->zpos = z; - cache->empty = GL_FALSE; - COPY_4FV(cache->color, st->ctx->Current.RasterColor); - } - - assert(px != -999); - assert(py != -999); - - if (x < cache->xmin) - cache->xmin = x; - if (y < cache->ymin) - cache->ymin = y; - if (x + width > cache->xmax) - cache->xmax = x + width; - if (y + height > cache->ymax) - cache->ymax = y + height; - - /* create the transfer if needed */ - create_cache_trans(st); - - unpack_bitmap(st, px, py, width, height, unpack, bitmap, - cache->buffer, BITMAP_CACHE_WIDTH); - - return GL_TRUE; /* accumulated */ -} - - - -/** - * Called via ctx->Driver.Bitmap() - */ -static void -st_Bitmap(struct gl_context *ctx, GLint x, GLint y, - GLsizei width, GLsizei height, - const struct gl_pixelstore_attrib *unpack, const GLubyte *bitmap ) -{ - struct st_context *st = st_context(ctx); - struct pipe_resource *pt; - - if (width == 0 || height == 0) - return; - - st_validate_state(st); - - if (!st->bitmap.vs) { - /* create pass-through vertex shader now */ - const uint semantic_names[] = { TGSI_SEMANTIC_POSITION, - TGSI_SEMANTIC_COLOR, - TGSI_SEMANTIC_GENERIC }; - const uint semantic_indexes[] = { 0, 0, 0 }; - st->bitmap.vs = util_make_vertex_passthrough_shader(st->pipe, 3, - semantic_names, - semantic_indexes); - } - - if (UseBitmapCache && accum_bitmap(st, x, y, width, height, unpack, bitmap)) - return; - - pt = make_bitmap_texture(ctx, width, height, unpack, bitmap); - if (pt) { - struct pipe_sampler_view *sv = - st_create_texture_sampler_view(st->pipe, pt); - - assert(pt->target == PIPE_TEXTURE_2D || pt->target == PIPE_TEXTURE_RECT); - - if (sv) { - draw_bitmap_quad(ctx, x, y, ctx->Current.RasterPos[2], - width, height, sv, - st->ctx->Current.RasterColor); - - pipe_sampler_view_reference(&sv, NULL); - } - - /* release/free the texture */ - pipe_resource_reference(&pt, NULL); - } -} - - -/** Per-context init */ -void -st_init_bitmap_functions(struct dd_function_table *functions) -{ - functions->Bitmap = st_Bitmap; -} - - -/** Per-context init */ -void -st_init_bitmap(struct st_context *st) -{ - struct pipe_sampler_state *sampler = &st->bitmap.samplers[0]; - struct pipe_context *pipe = st->pipe; - struct pipe_screen *screen = pipe->screen; - - /* init sampler state once */ - memset(sampler, 0, sizeof(*sampler)); - sampler->wrap_s = PIPE_TEX_WRAP_CLAMP; - sampler->wrap_t = PIPE_TEX_WRAP_CLAMP; - sampler->wrap_r = PIPE_TEX_WRAP_CLAMP; - sampler->min_img_filter = PIPE_TEX_FILTER_NEAREST; - sampler->min_mip_filter = PIPE_TEX_MIPFILTER_NONE; - sampler->mag_img_filter = PIPE_TEX_FILTER_NEAREST; - st->bitmap.samplers[1] = *sampler; - st->bitmap.samplers[1].normalized_coords = 1; - - /* init baseline rasterizer state once */ - memset(&st->bitmap.rasterizer, 0, sizeof(st->bitmap.rasterizer)); - st->bitmap.rasterizer.gl_rasterization_rules = 1; - - /* find a usable texture format */ - if (screen->is_format_supported(screen, PIPE_FORMAT_I8_UNORM, - PIPE_TEXTURE_2D, 0, - PIPE_BIND_SAMPLER_VIEW, 0)) { - st->bitmap.tex_format = PIPE_FORMAT_I8_UNORM; - } - else if (screen->is_format_supported(screen, PIPE_FORMAT_A8_UNORM, - PIPE_TEXTURE_2D, 0, - PIPE_BIND_SAMPLER_VIEW, 0)) { - st->bitmap.tex_format = PIPE_FORMAT_A8_UNORM; - } - else if (screen->is_format_supported(screen, PIPE_FORMAT_L8_UNORM, - PIPE_TEXTURE_2D, 0, - PIPE_BIND_SAMPLER_VIEW, 0)) { - st->bitmap.tex_format = PIPE_FORMAT_L8_UNORM; - } - else { - /* XXX support more formats */ - assert(0); - } - - /* alloc bitmap 
cache object */ - st->bitmap.cache = ST_CALLOC_STRUCT(bitmap_cache); - - reset_cache(st); -} - - -/** Per-context tear-down */ -void -st_destroy_bitmap(struct st_context *st) -{ - struct pipe_context *pipe = st->pipe; - struct bitmap_cache *cache = st->bitmap.cache; - - if (st->bitmap.vs) { - cso_delete_vertex_shader(st->cso_context, st->bitmap.vs); - st->bitmap.vs = NULL; - } - - if (st->bitmap.vbuf) { - pipe_resource_reference(&st->bitmap.vbuf, NULL); - st->bitmap.vbuf = NULL; - } - - if (cache) { - if (cache->trans) { - pipe_transfer_unmap(pipe, cache->trans); - pipe->transfer_destroy(pipe, cache->trans); - } - pipe_resource_reference(&st->bitmap.cache->texture, NULL); - free(st->bitmap.cache); - st->bitmap.cache = NULL; - } -} - -#endif /* FEATURE_drawpix */ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + /* + * Authors: + * Brian Paul + */ + +#include "main/imports.h" +#include "main/image.h" +#include "main/bufferobj.h" +#include "main/macros.h" +#include "main/mfeatures.h" +#include "program/program.h" +#include "program/prog_print.h" + +#include "st_context.h" +#include "st_atom.h" +#include "st_atom_constbuf.h" +#include "st_program.h" +#include "st_cb_bitmap.h" +#include "st_texture.h" + +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_shader_tokens.h" +#include "util/u_inlines.h" +#include "util/u_draw_quad.h" +#include "util/u_simple_shaders.h" +#include "program/prog_instruction.h" +#include "cso_cache/cso_context.h" + + +#if FEATURE_drawpix + +/** + * glBitmaps are drawn as textured quads. The user's bitmap pattern + * is stored in a texture image. An alpha8 texture format is used. + * The fragment shader samples a bit (texel) from the texture, then + * discards the fragment if the bit is off. + * + * Note that we actually store the inverse image of the bitmap to + * simplify the fragment program. An "on" bit gets stored as texel=0x0 + * and an "off" bit is stored as texel=0xff. Then we kill the + * fragment if the negated texel value is less than zero. + */ + + +/** + * The bitmap cache attempts to accumulate multiple glBitmap calls in a + * buffer which is then rendered en mass upon a flush, state change, etc. 
+ * A wide, short buffer is used to target the common case of a series + * of glBitmap calls being used to draw text. + */ +static GLboolean UseBitmapCache = GL_TRUE; + + +#define BITMAP_CACHE_WIDTH 512 +#define BITMAP_CACHE_HEIGHT 32 + +struct bitmap_cache +{ + /** Window pos to render the cached image */ + GLint xpos, ypos; + /** Bounds of region used in window coords */ + GLint xmin, ymin, xmax, ymax; + + GLfloat color[4]; + + /** Bitmap's Z position */ + GLfloat zpos; + + struct pipe_resource *texture; + struct pipe_transfer *trans; + + GLboolean empty; + + /** An I8 texture image: */ + ubyte *buffer; +}; + + +/** Epsilon for Z comparisons */ +#define Z_EPSILON 1e-06 + + +/** + * Make fragment program for glBitmap: + * Sample the texture and kill the fragment if the bit is 0. + * This program will be combined with the user's fragment program. + */ +static struct st_fragment_program * +make_bitmap_fragment_program(struct gl_context *ctx, GLuint samplerIndex) +{ + struct st_context *st = st_context(ctx); + struct st_fragment_program *stfp; + struct gl_program *p; + GLuint ic = 0; + + p = ctx->Driver.NewProgram(ctx, GL_FRAGMENT_PROGRAM_ARB, 0); + if (!p) + return NULL; + + p->NumInstructions = 3; + + p->Instructions = _mesa_alloc_instructions(p->NumInstructions); + if (!p->Instructions) { + ctx->Driver.DeleteProgram(ctx, p); + return NULL; + } + _mesa_init_instructions(p->Instructions, p->NumInstructions); + + /* TEX tmp0, fragment.texcoord[0], texture[0], 2D; */ + p->Instructions[ic].Opcode = OPCODE_TEX; + p->Instructions[ic].DstReg.File = PROGRAM_TEMPORARY; + p->Instructions[ic].DstReg.Index = 0; + p->Instructions[ic].SrcReg[0].File = PROGRAM_INPUT; + p->Instructions[ic].SrcReg[0].Index = FRAG_ATTRIB_TEX0; + p->Instructions[ic].TexSrcUnit = samplerIndex; + p->Instructions[ic].TexSrcTarget = TEXTURE_2D_INDEX; + ic++; + + /* KIL if -tmp0 < 0 # texel=0 -> keep / texel=0 -> discard */ + p->Instructions[ic].Opcode = OPCODE_KIL; + p->Instructions[ic].SrcReg[0].File = PROGRAM_TEMPORARY; + + if (st->bitmap.tex_format == PIPE_FORMAT_L8_UNORM) + p->Instructions[ic].SrcReg[0].Swizzle = SWIZZLE_XXXX; + + p->Instructions[ic].SrcReg[0].Index = 0; + p->Instructions[ic].SrcReg[0].Negate = NEGATE_XYZW; + ic++; + + /* END; */ + p->Instructions[ic++].Opcode = OPCODE_END; + + assert(ic == p->NumInstructions); + + p->InputsRead = FRAG_BIT_TEX0; + p->OutputsWritten = 0x0; + p->SamplersUsed = (1 << samplerIndex); + + stfp = (struct st_fragment_program *) p; + stfp->Base.UsesKill = GL_TRUE; + + return stfp; +} + + +static int +find_free_bit(uint bitfield) +{ + int i; + for (i = 0; i < 32; i++) { + if ((bitfield & (1 << i)) == 0) { + return i; + } + } + return -1; +} + + +/** + * Combine basic bitmap fragment program with the user-defined program. + * \param st current context + * \param fpIn the incoming fragment program + * \param fpOut the new fragment program which does fragment culling + * \param bitmap_sampler sampler number for the bitmap texture + */ +void +st_make_bitmap_fragment_program(struct st_context *st, + struct gl_fragment_program *fpIn, + struct gl_fragment_program **fpOut, + GLuint *bitmap_sampler) +{ + struct st_fragment_program *bitmap_prog; + struct gl_program *newProg; + uint sampler; + + /* + * Generate new program which is the user-defined program prefixed + * with the bitmap sampler/kill instructions. 
+ */ + sampler = find_free_bit(fpIn->Base.SamplersUsed); + bitmap_prog = make_bitmap_fragment_program(st->ctx, sampler); + + newProg = _mesa_combine_programs(st->ctx, + &bitmap_prog->Base.Base, + &fpIn->Base); + /* done with this after combining */ + st_reference_fragprog(st, &bitmap_prog, NULL); + +#if 0 + { + printf("Combined bitmap program:\n"); + _mesa_print_program(newProg); + printf("InputsRead: 0x%x\n", newProg->InputsRead); + printf("OutputsWritten: 0x%x\n", newProg->OutputsWritten); + _mesa_print_parameter_list(newProg->Parameters); + } +#endif + + /* return results */ + *fpOut = (struct gl_fragment_program *) newProg; + *bitmap_sampler = sampler; +} + + +/** + * Copy user-provide bitmap bits into texture buffer, expanding + * bits into texels. + * "On" bits will set texels to 0x0. + * "Off" bits will not modify texels. + * Note that the image is actually going to be upside down in + * the texture. We deal with that with texcoords. + */ +static void +unpack_bitmap(struct st_context *st, + GLint px, GLint py, GLsizei width, GLsizei height, + const struct gl_pixelstore_attrib *unpack, + const GLubyte *bitmap, + ubyte *destBuffer, uint destStride) +{ + destBuffer += py * destStride + px; + + _mesa_expand_bitmap(width, height, unpack, bitmap, + destBuffer, destStride, 0x0); +} + + +/** + * Create a texture which represents a bitmap image. + */ +static struct pipe_resource * +make_bitmap_texture(struct gl_context *ctx, GLsizei width, GLsizei height, + const struct gl_pixelstore_attrib *unpack, + const GLubyte *bitmap) +{ + struct st_context *st = st_context(ctx); + struct pipe_context *pipe = st->pipe; + struct pipe_transfer *transfer; + ubyte *dest; + struct pipe_resource *pt; + + /* PBO source... */ + bitmap = _mesa_map_pbo_source(ctx, unpack, bitmap); + if (!bitmap) { + return NULL; + } + + /** + * Create texture to hold bitmap pattern. 
+ */ + pt = st_texture_create(st, st->internal_target, st->bitmap.tex_format, + 0, width, height, 1, 1, + PIPE_BIND_SAMPLER_VIEW); + if (!pt) { + _mesa_unmap_pbo_source(ctx, unpack); + return NULL; + } + + transfer = pipe_get_transfer(st->pipe, pt, 0, 0, + PIPE_TRANSFER_WRITE, + 0, 0, width, height); + + dest = pipe_transfer_map(pipe, transfer); + + /* Put image into texture transfer */ + memset(dest, 0xff, height * transfer->stride); + unpack_bitmap(st, 0, 0, width, height, unpack, bitmap, + dest, transfer->stride); + + _mesa_unmap_pbo_source(ctx, unpack); + + /* Release transfer */ + pipe_transfer_unmap(pipe, transfer); + pipe->transfer_destroy(pipe, transfer); + + return pt; +} + +static GLuint +setup_bitmap_vertex_data(struct st_context *st, bool normalized, + int x, int y, int width, int height, + float z, const float color[4]) +{ + struct pipe_context *pipe = st->pipe; + const struct gl_framebuffer *fb = st->ctx->DrawBuffer; + const GLfloat fb_width = (GLfloat)fb->Width; + const GLfloat fb_height = (GLfloat)fb->Height; + const GLfloat x0 = (GLfloat)x; + const GLfloat x1 = (GLfloat)(x + width); + const GLfloat y0 = (GLfloat)y; + const GLfloat y1 = (GLfloat)(y + height); + GLfloat sLeft = (GLfloat)0.0, sRight = (GLfloat)1.0; + GLfloat tTop = (GLfloat)0.0, tBot = (GLfloat)1.0 - tTop; + const GLfloat clip_x0 = (GLfloat)(x0 / fb_width * 2.0 - 1.0); + const GLfloat clip_y0 = (GLfloat)(y0 / fb_height * 2.0 - 1.0); + const GLfloat clip_x1 = (GLfloat)(x1 / fb_width * 2.0 - 1.0); + const GLfloat clip_y1 = (GLfloat)(y1 / fb_height * 2.0 - 1.0); + const GLuint max_slots = 1; /* 4096 / sizeof(st->bitmap.vertices); */ + GLuint i; + + if(!normalized) + { + sRight = width; + tBot = height; + } + + /* XXX: Need to improve buffer_write to allow NO_WAIT (as well as + * no_flush) updates to buffers where we know there is no conflict + * with previous data. Currently using max_slots > 1 will cause + * synchronous rendering if the driver flushes its command buffers + * between one bitmap and the next. Our flush hook below isn't + * sufficient to catch this as the driver doesn't tell us when it + * flushes its own command buffers. Until this gets fixed, pay the + * price of allocating a new buffer for each bitmap cache-flush to + * avoid synchronous rendering. + */ + if (st->bitmap.vbuf_slot >= max_slots) { + pipe_resource_reference(&st->bitmap.vbuf, NULL); + st->bitmap.vbuf_slot = 0; + } + + if (!st->bitmap.vbuf) { + st->bitmap.vbuf = pipe_buffer_create(pipe->screen, + PIPE_BIND_VERTEX_BUFFER, + PIPE_USAGE_STREAM, + max_slots * + sizeof(st->bitmap.vertices)); + } + + /* Positions are in clip coords since we need to do clipping in case + * the bitmap quad goes beyond the window bounds. 
+ */ + st->bitmap.vertices[0][0][0] = clip_x0; + st->bitmap.vertices[0][0][1] = clip_y0; + st->bitmap.vertices[0][2][0] = sLeft; + st->bitmap.vertices[0][2][1] = tTop; + + st->bitmap.vertices[1][0][0] = clip_x1; + st->bitmap.vertices[1][0][1] = clip_y0; + st->bitmap.vertices[1][2][0] = sRight; + st->bitmap.vertices[1][2][1] = tTop; + + st->bitmap.vertices[2][0][0] = clip_x1; + st->bitmap.vertices[2][0][1] = clip_y1; + st->bitmap.vertices[2][2][0] = sRight; + st->bitmap.vertices[2][2][1] = tBot; + + st->bitmap.vertices[3][0][0] = clip_x0; + st->bitmap.vertices[3][0][1] = clip_y1; + st->bitmap.vertices[3][2][0] = sLeft; + st->bitmap.vertices[3][2][1] = tBot; + + /* same for all verts: */ + for (i = 0; i < 4; i++) { + st->bitmap.vertices[i][0][2] = z; + st->bitmap.vertices[i][0][3] = 1.0; + st->bitmap.vertices[i][1][0] = color[0]; + st->bitmap.vertices[i][1][1] = color[1]; + st->bitmap.vertices[i][1][2] = color[2]; + st->bitmap.vertices[i][1][3] = color[3]; + st->bitmap.vertices[i][2][2] = 0.0; /*R*/ + st->bitmap.vertices[i][2][3] = 1.0; /*Q*/ + } + + /* put vertex data into vbuf */ + pipe_buffer_write_nooverlap(st->pipe, + st->bitmap.vbuf, + st->bitmap.vbuf_slot + * sizeof(st->bitmap.vertices), + sizeof st->bitmap.vertices, + st->bitmap.vertices); + + return st->bitmap.vbuf_slot++ * sizeof st->bitmap.vertices; +} + + + +/** + * Render a glBitmap by drawing a textured quad + */ +static void +draw_bitmap_quad(struct gl_context *ctx, GLint x, GLint y, GLfloat z, + GLsizei width, GLsizei height, + struct pipe_sampler_view *sv, + const GLfloat *color) +{ + struct st_context *st = st_context(ctx); + struct pipe_context *pipe = st->pipe; + struct cso_context *cso = st->cso_context; + struct st_fp_variant *fpv; + struct st_fp_variant_key key; + GLuint maxSize; + GLuint offset; + + memset(&key, 0, sizeof(key)); + key.st = st; + key.bitmap = GL_TRUE; + + fpv = st_get_fp_variant(st, st->fp, &key); + + /* As an optimization, Mesa's fragment programs will sometimes get the + * primary color from a statevar/constant rather than a varying variable. + * when that's the case, we need to ensure that we use the 'color' + * parameter and not the current attribute color (which may have changed + * through glRasterPos and state validation. + * So, we force the proper color here. Not elegant, but it works. + */ + { + GLfloat colorSave[4]; + COPY_4V(colorSave, ctx->Current.Attrib[VERT_ATTRIB_COLOR0]); + COPY_4V(ctx->Current.Attrib[VERT_ATTRIB_COLOR0], color); + st_upload_constants(st, fpv->parameters, PIPE_SHADER_FRAGMENT); + COPY_4V(ctx->Current.Attrib[VERT_ATTRIB_COLOR0], colorSave); + } + + + /* limit checks */ + /* XXX if the bitmap is larger than the max texture size, break + * it up into chunks. 
+ */ + maxSize = 1 << (pipe->screen->get_param(pipe->screen, + PIPE_CAP_MAX_TEXTURE_2D_LEVELS) - 1); + assert(width <= (GLsizei)maxSize); + assert(height <= (GLsizei)maxSize); + + cso_save_rasterizer(cso); + cso_save_samplers(cso); + cso_save_fragment_sampler_views(cso); + cso_save_viewport(cso); + cso_save_fragment_shader(cso); + cso_save_vertex_shader(cso); + cso_save_vertex_elements(cso); + cso_save_vertex_buffers(cso); + + /* rasterizer state: just scissor */ + st->bitmap.rasterizer.scissor = ctx->Scissor.Enabled; + cso_set_rasterizer(cso, &st->bitmap.rasterizer); + + /* fragment shader state: TEX lookup program */ + cso_set_fragment_shader_handle(cso, fpv->driver_shader); + + /* vertex shader state: position + texcoord pass-through */ + cso_set_vertex_shader_handle(cso, st->bitmap.vs); + + /* user samplers, plus our bitmap sampler */ + { + struct pipe_sampler_state *samplers[PIPE_MAX_SAMPLERS]; + uint num = MAX2(fpv->bitmap_sampler + 1, st->state.num_samplers); + uint i; + for (i = 0; i < st->state.num_samplers; i++) { + samplers[i] = &st->state.samplers[i]; + } + samplers[fpv->bitmap_sampler] = + &st->bitmap.samplers[sv->texture->target != PIPE_TEXTURE_RECT]; + cso_set_samplers(cso, num, (const struct pipe_sampler_state **) samplers); + } + + /* user textures, plus the bitmap texture */ + { + struct pipe_sampler_view *sampler_views[PIPE_MAX_SAMPLERS]; + uint num = MAX2(fpv->bitmap_sampler + 1, st->state.num_textures); + memcpy(sampler_views, st->state.sampler_views, sizeof(sampler_views)); + sampler_views[fpv->bitmap_sampler] = sv; + cso_set_fragment_sampler_views(cso, num, sampler_views); + } + + /* viewport state: viewport matching window dims */ + { + const struct gl_framebuffer *fb = st->ctx->DrawBuffer; + const GLboolean invert = (st_fb_orientation(fb) == Y_0_TOP); + const GLfloat width = (GLfloat)fb->Width; + const GLfloat height = (GLfloat)fb->Height; + struct pipe_viewport_state vp; + vp.scale[0] = 0.5f * width; + vp.scale[1] = height * (invert ? 
-0.5f : 0.5f); + vp.scale[2] = 0.5f; + vp.scale[3] = 1.0f; + vp.translate[0] = 0.5f * width; + vp.translate[1] = 0.5f * height; + vp.translate[2] = 0.5f; + vp.translate[3] = 0.0f; + cso_set_viewport(cso, &vp); + } + + cso_set_vertex_elements(cso, 3, st->velems_util_draw); + + /* convert Z from [0,1] to [-1,-1] to match viewport Z scale/bias */ + z = z * 2.0 - 1.0; + + /* draw textured quad */ + offset = setup_bitmap_vertex_data(st, + sv->texture->target != PIPE_TEXTURE_RECT, + x, y, width, height, z, color); + + util_draw_vertex_buffer(pipe, st->cso_context, st->bitmap.vbuf, offset, + PIPE_PRIM_TRIANGLE_FAN, + 4, /* verts */ + 3); /* attribs/vert */ + + + /* restore state */ + cso_restore_rasterizer(cso); + cso_restore_samplers(cso); + cso_restore_fragment_sampler_views(cso); + cso_restore_viewport(cso); + cso_restore_fragment_shader(cso); + cso_restore_vertex_shader(cso); + cso_restore_vertex_elements(cso); + cso_restore_vertex_buffers(cso); +} + + +static void +reset_cache(struct st_context *st) +{ + struct pipe_context *pipe = st->pipe; + struct bitmap_cache *cache = st->bitmap.cache; + + /*memset(cache->buffer, 0xff, sizeof(cache->buffer));*/ + cache->empty = GL_TRUE; + + cache->xmin = 1000000; + cache->xmax = -1000000; + cache->ymin = 1000000; + cache->ymax = -1000000; + + if (cache->trans) { + pipe->transfer_destroy(pipe, cache->trans); + cache->trans = NULL; + } + + assert(!cache->texture); + + /* allocate a new texture */ + cache->texture = st_texture_create(st, PIPE_TEXTURE_2D, + st->bitmap.tex_format, 0, + BITMAP_CACHE_WIDTH, BITMAP_CACHE_HEIGHT, + 1, 1, + PIPE_BIND_SAMPLER_VIEW); +} + + +/** Print bitmap image to stdout (debug) */ +static void +print_cache(const struct bitmap_cache *cache) +{ + int i, j, k; + + for (i = 0; i < BITMAP_CACHE_HEIGHT; i++) { + k = BITMAP_CACHE_WIDTH * (BITMAP_CACHE_HEIGHT - i - 1); + for (j = 0; j < BITMAP_CACHE_WIDTH; j++) { + if (cache->buffer[k]) + printf("X"); + else + printf(" "); + k++; + } + printf("\n"); + } +} + + +/** + * Create gallium pipe_transfer object for the bitmap cache. + */ +static void +create_cache_trans(struct st_context *st) +{ + struct pipe_context *pipe = st->pipe; + struct bitmap_cache *cache = st->bitmap.cache; + + if (cache->trans) + return; + + /* Map the texture transfer. + * Subsequent glBitmap calls will write into the texture image. + */ + cache->trans = pipe_get_transfer(st->pipe, cache->texture, 0, 0, + PIPE_TRANSFER_WRITE, 0, 0, + BITMAP_CACHE_WIDTH, + BITMAP_CACHE_HEIGHT); + cache->buffer = pipe_transfer_map(pipe, cache->trans); + + /* init image to all 0xff */ + memset(cache->buffer, 0xff, cache->trans->stride * BITMAP_CACHE_HEIGHT); +} + + +/** + * If there's anything in the bitmap cache, draw/flush it now. + */ +void +st_flush_bitmap_cache(struct st_context *st) +{ + if (!st->bitmap.cache->empty) { + struct bitmap_cache *cache = st->bitmap.cache; + + if (st->ctx->DrawBuffer) { + struct pipe_context *pipe = st->pipe; + struct pipe_sampler_view *sv; + + assert(cache->xmin <= cache->xmax); + +/* printf("flush size %d x %d at %d, %d\n", + cache->xmax - cache->xmin, + cache->ymax - cache->ymin, + cache->xpos, cache->ypos); +*/ + + /* The texture transfer has been mapped until now. + * So unmap and release the texture transfer before drawing. 
+ */ + if (cache->trans) { + if (0) + print_cache(cache); + pipe_transfer_unmap(pipe, cache->trans); + cache->buffer = NULL; + + pipe->transfer_destroy(pipe, cache->trans); + cache->trans = NULL; + } + + sv = st_create_texture_sampler_view(st->pipe, cache->texture); + if (sv) { + draw_bitmap_quad(st->ctx, + cache->xpos, + cache->ypos, + cache->zpos, + BITMAP_CACHE_WIDTH, BITMAP_CACHE_HEIGHT, + sv, + cache->color); + + pipe_sampler_view_reference(&sv, NULL); + } + } + + /* release/free the texture */ + pipe_resource_reference(&cache->texture, NULL); + + reset_cache(st); + } +} + + +/** + * Flush bitmap cache and release vertex buffer. + */ +void +st_flush_bitmap( struct st_context *st ) +{ + st_flush_bitmap_cache(st); + + /* Release vertex buffer to avoid synchronous rendering if we were + * to map it in the next frame. + */ + pipe_resource_reference(&st->bitmap.vbuf, NULL); + st->bitmap.vbuf_slot = 0; +} + + +/** + * Try to accumulate this glBitmap call in the bitmap cache. + * \return GL_TRUE for success, GL_FALSE if bitmap is too large, etc. + */ +static GLboolean +accum_bitmap(struct st_context *st, + GLint x, GLint y, GLsizei width, GLsizei height, + const struct gl_pixelstore_attrib *unpack, + const GLubyte *bitmap ) +{ + struct bitmap_cache *cache = st->bitmap.cache; + int px = -999, py = -999; + const GLfloat z = st->ctx->Current.RasterPos[2]; + + if (width > BITMAP_CACHE_WIDTH || + height > BITMAP_CACHE_HEIGHT) + return GL_FALSE; /* too big to cache */ + + if (!cache->empty) { + px = x - cache->xpos; /* pos in buffer */ + py = y - cache->ypos; + if (px < 0 || px + width > BITMAP_CACHE_WIDTH || + py < 0 || py + height > BITMAP_CACHE_HEIGHT || + !TEST_EQ_4V(st->ctx->Current.RasterColor, cache->color) || + ((fabs(z - cache->zpos) > Z_EPSILON))) { + /* This bitmap would extend beyond cache bounds, or the bitmap + * color is changing + * so flush and continue. + */ + st_flush_bitmap_cache(st); + } + } + + if (cache->empty) { + /* Initialize. Center bitmap vertically in the buffer. 
*/ + px = 0; + py = (BITMAP_CACHE_HEIGHT - height) / 2; + cache->xpos = x; + cache->ypos = y - py; + cache->zpos = z; + cache->empty = GL_FALSE; + COPY_4FV(cache->color, st->ctx->Current.RasterColor); + } + + assert(px != -999); + assert(py != -999); + + if (x < cache->xmin) + cache->xmin = x; + if (y < cache->ymin) + cache->ymin = y; + if (x + width > cache->xmax) + cache->xmax = x + width; + if (y + height > cache->ymax) + cache->ymax = y + height; + + /* create the transfer if needed */ + create_cache_trans(st); + + unpack_bitmap(st, px, py, width, height, unpack, bitmap, + cache->buffer, BITMAP_CACHE_WIDTH); + + return GL_TRUE; /* accumulated */ +} + + + +/** + * Called via ctx->Driver.Bitmap() + */ +static void +st_Bitmap(struct gl_context *ctx, GLint x, GLint y, + GLsizei width, GLsizei height, + const struct gl_pixelstore_attrib *unpack, const GLubyte *bitmap ) +{ + struct st_context *st = st_context(ctx); + struct pipe_resource *pt; + + if (width == 0 || height == 0) + return; + + st_validate_state(st); + + if (!st->bitmap.vs) { + /* create pass-through vertex shader now */ + const uint semantic_names[] = { TGSI_SEMANTIC_POSITION, + TGSI_SEMANTIC_COLOR, + TGSI_SEMANTIC_GENERIC }; + const uint semantic_indexes[] = { 0, 0, 0 }; + st->bitmap.vs = util_make_vertex_passthrough_shader(st->pipe, 3, + semantic_names, + semantic_indexes); + } + + if (UseBitmapCache && accum_bitmap(st, x, y, width, height, unpack, bitmap)) + return; + + pt = make_bitmap_texture(ctx, width, height, unpack, bitmap); + if (pt) { + struct pipe_sampler_view *sv = + st_create_texture_sampler_view(st->pipe, pt); + + assert(pt->target == PIPE_TEXTURE_2D || pt->target == PIPE_TEXTURE_RECT); + + if (sv) { + draw_bitmap_quad(ctx, x, y, ctx->Current.RasterPos[2], + width, height, sv, + st->ctx->Current.RasterColor); + + pipe_sampler_view_reference(&sv, NULL); + } + + /* release/free the texture */ + pipe_resource_reference(&pt, NULL); + } +} + + +/** Per-context init */ +void +st_init_bitmap_functions(struct dd_function_table *functions) +{ + functions->Bitmap = st_Bitmap; +} + + +/** Per-context init */ +void +st_init_bitmap(struct st_context *st) +{ + struct pipe_sampler_state *sampler = &st->bitmap.samplers[0]; + struct pipe_context *pipe = st->pipe; + struct pipe_screen *screen = pipe->screen; + + /* init sampler state once */ + memset(sampler, 0, sizeof(*sampler)); + sampler->wrap_s = PIPE_TEX_WRAP_CLAMP; + sampler->wrap_t = PIPE_TEX_WRAP_CLAMP; + sampler->wrap_r = PIPE_TEX_WRAP_CLAMP; + sampler->min_img_filter = PIPE_TEX_FILTER_NEAREST; + sampler->min_mip_filter = PIPE_TEX_MIPFILTER_NONE; + sampler->mag_img_filter = PIPE_TEX_FILTER_NEAREST; + st->bitmap.samplers[1] = *sampler; + st->bitmap.samplers[1].normalized_coords = 1; + + /* init baseline rasterizer state once */ + memset(&st->bitmap.rasterizer, 0, sizeof(st->bitmap.rasterizer)); + st->bitmap.rasterizer.gl_rasterization_rules = 1; + + /* find a usable texture format */ + if (screen->is_format_supported(screen, PIPE_FORMAT_I8_UNORM, + PIPE_TEXTURE_2D, 0, + PIPE_BIND_SAMPLER_VIEW, 0)) { + st->bitmap.tex_format = PIPE_FORMAT_I8_UNORM; + } + else if (screen->is_format_supported(screen, PIPE_FORMAT_A8_UNORM, + PIPE_TEXTURE_2D, 0, + PIPE_BIND_SAMPLER_VIEW, 0)) { + st->bitmap.tex_format = PIPE_FORMAT_A8_UNORM; + } + else if (screen->is_format_supported(screen, PIPE_FORMAT_L8_UNORM, + PIPE_TEXTURE_2D, 0, + PIPE_BIND_SAMPLER_VIEW, 0)) { + st->bitmap.tex_format = PIPE_FORMAT_L8_UNORM; + } + else { + /* XXX support more formats */ + assert(0); + } + + /* alloc bitmap 
cache object */ + st->bitmap.cache = ST_CALLOC_STRUCT(bitmap_cache); + + reset_cache(st); +} + + +/** Per-context tear-down */ +void +st_destroy_bitmap(struct st_context *st) +{ + struct pipe_context *pipe = st->pipe; + struct bitmap_cache *cache = st->bitmap.cache; + + if (st->bitmap.vs) { + cso_delete_vertex_shader(st->cso_context, st->bitmap.vs); + st->bitmap.vs = NULL; + } + + if (st->bitmap.vbuf) { + pipe_resource_reference(&st->bitmap.vbuf, NULL); + st->bitmap.vbuf = NULL; + } + + if (cache) { + if (cache->trans) { + pipe_transfer_unmap(pipe, cache->trans); + pipe->transfer_destroy(pipe, cache->trans); + } + pipe_resource_reference(&st->bitmap.cache->texture, NULL); + free(st->bitmap.cache); + st->bitmap.cache = NULL; + } +} + +#endif /* FEATURE_drawpix */ diff --git a/mesalib/src/mesa/state_tracker/st_cb_bufferobjects.c b/mesalib/src/mesa/state_tracker/st_cb_bufferobjects.c index ba8a8cf89..12528f49f 100644 --- a/mesalib/src/mesa/state_tracker/st_cb_bufferobjects.c +++ b/mesalib/src/mesa/state_tracker/st_cb_bufferobjects.c @@ -1,447 +1,468 @@ -/************************************************************************** - * - * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. - * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR - * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - **************************************************************************/ - - -/** - * Functions for pixel buffer objects and vertex/element buffer objects. - */ - - -#include "main/imports.h" -#include "main/mtypes.h" -#include "main/arrayobj.h" -#include "main/bufferobj.h" - -#include "st_context.h" -#include "st_cb_bufferobjects.h" - -#include "pipe/p_context.h" -#include "pipe/p_defines.h" -#include "util/u_inlines.h" - - -/** - * There is some duplication between mesa's bufferobjects and our - * bufmgr buffers. Both have an integer handle and a hashtable to - * lookup an opaque structure. It would be nice if the handles and - * internal structure where somehow shared. - */ -static struct gl_buffer_object * -st_bufferobj_alloc(struct gl_context *ctx, GLuint name, GLenum target) -{ - struct st_buffer_object *st_obj = ST_CALLOC_STRUCT(st_buffer_object); - - if (!st_obj) - return NULL; - - _mesa_initialize_buffer_object(&st_obj->Base, name, target); - - return &st_obj->Base; -} - - - -/** - * Deallocate/free a vertex/pixel buffer object. - * Called via glDeleteBuffersARB(). 
- */ -static void -st_bufferobj_free(struct gl_context *ctx, struct gl_buffer_object *obj) -{ - struct st_buffer_object *st_obj = st_buffer_object(obj); - - assert(obj->RefCount == 0); - assert(st_obj->transfer == NULL); - - if (st_obj->buffer) - pipe_resource_reference(&st_obj->buffer, NULL); - - free(st_obj); -} - - - -/** - * Replace data in a subrange of buffer object. If the data range - * specified by size + offset extends beyond the end of the buffer or - * if data is NULL, no copy is performed. - * Called via glBufferSubDataARB(). - */ -static void -st_bufferobj_subdata(struct gl_context *ctx, - GLenum target, - GLintptrARB offset, - GLsizeiptrARB size, - const GLvoid * data, struct gl_buffer_object *obj) -{ - struct st_buffer_object *st_obj = st_buffer_object(obj); - - /* we may be called from VBO code, so double-check params here */ - ASSERT(offset >= 0); - ASSERT(size >= 0); - ASSERT(offset + size <= obj->Size); - - if (!size) - return; - - /* - * According to ARB_vertex_buffer_object specification, if data is null, - * then the contents of the buffer object's data store is undefined. We just - * ignore, and leave it unchanged. - */ - if (!data) - return; - - /* Now that transfers are per-context, we don't have to figure out - * flushing here. Usually drivers won't need to flush in this case - * even if the buffer is currently referenced by hardware - they - * just queue the upload as dma rather than mapping the underlying - * buffer directly. - */ - pipe_buffer_write(st_context(ctx)->pipe, - st_obj->buffer, - offset, size, data); -} - - -/** - * Called via glGetBufferSubDataARB(). - */ -static void -st_bufferobj_get_subdata(struct gl_context *ctx, - GLenum target, - GLintptrARB offset, - GLsizeiptrARB size, - GLvoid * data, struct gl_buffer_object *obj) -{ - struct st_buffer_object *st_obj = st_buffer_object(obj); - - /* we may be called from VBO code, so double-check params here */ - ASSERT(offset >= 0); - ASSERT(size >= 0); - ASSERT(offset + size <= obj->Size); - - if (!size) - return; - - pipe_buffer_read(st_context(ctx)->pipe, st_obj->buffer, - offset, size, data); -} - - -/** - * Allocate space for and store data in a buffer object. Any data that was - * previously stored in the buffer object is lost. If data is NULL, - * memory will be allocated, but no copy will occur. - * Called via ctx->Driver.BufferData(). 
- * \return GL_TRUE for success, GL_FALSE if out of memory - */ -static GLboolean -st_bufferobj_data(struct gl_context *ctx, - GLenum target, - GLsizeiptrARB size, - const GLvoid * data, - GLenum usage, - struct gl_buffer_object *obj) -{ - struct st_context *st = st_context(ctx); - struct pipe_context *pipe = st->pipe; - struct st_buffer_object *st_obj = st_buffer_object(obj); - unsigned buffer_usage; - - st_obj->Base.Size = size; - st_obj->Base.Usage = usage; - - switch(target) { - case GL_PIXEL_PACK_BUFFER_ARB: - case GL_PIXEL_UNPACK_BUFFER_ARB: - buffer_usage = PIPE_BIND_RENDER_TARGET; - break; - case GL_ARRAY_BUFFER_ARB: - buffer_usage = PIPE_BIND_VERTEX_BUFFER; - break; - case GL_ELEMENT_ARRAY_BUFFER_ARB: - buffer_usage = PIPE_BIND_INDEX_BUFFER; - break; - default: - buffer_usage = 0; - } - - pipe_resource_reference( &st_obj->buffer, NULL ); - - if (size != 0) { - st_obj->buffer = pipe_buffer_create(pipe->screen, buffer_usage, size); - - if (!st_obj->buffer) { - return GL_FALSE; - } - - if (data) - pipe_buffer_write(st_context(ctx)->pipe, st_obj->buffer, 0, - size, data); - return GL_TRUE; - } - - return GL_TRUE; -} - - -/** - * Dummy data whose's pointer is used for zero size buffers or ranges. - */ -static long st_bufferobj_zero_length = 0; - - - -/** - * Called via glMapBufferARB(). - */ -static void * -st_bufferobj_map(struct gl_context *ctx, GLenum target, GLenum access, - struct gl_buffer_object *obj) -{ - struct st_buffer_object *st_obj = st_buffer_object(obj); - uint flags; - - switch (access) { - case GL_WRITE_ONLY: - flags = PIPE_TRANSFER_WRITE; - break; - case GL_READ_ONLY: - flags = PIPE_TRANSFER_READ; - break; - case GL_READ_WRITE: - default: - flags = PIPE_TRANSFER_READ_WRITE; - break; - } - - /* Handle zero-size buffers here rather than in drivers */ - if (obj->Size == 0) { - obj->Pointer = &st_bufferobj_zero_length; - } - else { - obj->Pointer = pipe_buffer_map(st_context(ctx)->pipe, - st_obj->buffer, - flags, - &st_obj->transfer); - } - - if (obj->Pointer) { - obj->Offset = 0; - obj->Length = obj->Size; - } - return obj->Pointer; -} - - -/** - * Called via glMapBufferRange(). - */ -static void * -st_bufferobj_map_range(struct gl_context *ctx, GLenum target, - GLintptr offset, GLsizeiptr length, GLbitfield access, - struct gl_buffer_object *obj) -{ - struct pipe_context *pipe = st_context(ctx)->pipe; - struct st_buffer_object *st_obj = st_buffer_object(obj); - enum pipe_transfer_usage flags = 0x0; - - if (access & GL_MAP_WRITE_BIT) - flags |= PIPE_TRANSFER_WRITE; - - if (access & GL_MAP_READ_BIT) - flags |= PIPE_TRANSFER_READ; - - if (access & GL_MAP_FLUSH_EXPLICIT_BIT) - flags |= PIPE_TRANSFER_FLUSH_EXPLICIT; - - if (access & GL_MAP_INVALIDATE_RANGE_BIT) - flags |= PIPE_TRANSFER_DISCARD; - - if (access & GL_MAP_INVALIDATE_BUFFER_BIT) - flags |= PIPE_TRANSFER_DISCARD; - - if (access & GL_MAP_UNSYNCHRONIZED_BIT) - flags |= PIPE_TRANSFER_UNSYNCHRONIZED; - - /* ... other flags ... - */ - - if (access & MESA_MAP_NOWAIT_BIT) - flags |= PIPE_TRANSFER_DONTBLOCK; - - assert(offset >= 0); - assert(length >= 0); - assert(offset < obj->Size); - assert(offset + length <= obj->Size); - - /* - * We go out of way here to hide the degenerate yet valid case of zero - * length range from the pipe driver. 
- */ - if (!length) { - obj->Pointer = &st_bufferobj_zero_length; - } - else { - obj->Pointer = pipe_buffer_map_range(pipe, - st_obj->buffer, - offset, length, - flags, - &st_obj->transfer); - if (obj->Pointer) { - obj->Pointer = (ubyte *) obj->Pointer + offset; - } - } - - if (obj->Pointer) { - obj->Offset = offset; - obj->Length = length; - obj->AccessFlags = access; - } - - return obj->Pointer; -} - - -static void -st_bufferobj_flush_mapped_range(struct gl_context *ctx, GLenum target, - GLintptr offset, GLsizeiptr length, - struct gl_buffer_object *obj) -{ - struct pipe_context *pipe = st_context(ctx)->pipe; - struct st_buffer_object *st_obj = st_buffer_object(obj); - - /* Subrange is relative to mapped range */ - assert(offset >= 0); - assert(length >= 0); - assert(offset + length <= obj->Length); - assert(obj->Pointer); - - if (!length) - return; - - pipe_buffer_flush_mapped_range(pipe, st_obj->transfer, - obj->Offset + offset, length); -} - - -/** - * Called via glUnmapBufferARB(). - */ -static GLboolean -st_bufferobj_unmap(struct gl_context *ctx, GLenum target, struct gl_buffer_object *obj) -{ - struct pipe_context *pipe = st_context(ctx)->pipe; - struct st_buffer_object *st_obj = st_buffer_object(obj); - - if (obj->Length) - pipe_buffer_unmap(pipe, st_obj->transfer); - - st_obj->transfer = NULL; - obj->Pointer = NULL; - obj->Offset = 0; - obj->Length = 0; - return GL_TRUE; -} - - -/** - * Called via glCopyBufferSubData(). - */ -static void -st_copy_buffer_subdata(struct gl_context *ctx, - struct gl_buffer_object *src, - struct gl_buffer_object *dst, - GLintptr readOffset, GLintptr writeOffset, - GLsizeiptr size) -{ - struct pipe_context *pipe = st_context(ctx)->pipe; - struct st_buffer_object *srcObj = st_buffer_object(src); - struct st_buffer_object *dstObj = st_buffer_object(dst); - struct pipe_transfer *src_transfer; - struct pipe_transfer *dst_transfer; - ubyte *srcPtr, *dstPtr; - - if(!size) - return; - - /* buffer should not already be mapped */ - assert(!src->Pointer); - assert(!dst->Pointer); - - srcPtr = (ubyte *) pipe_buffer_map_range(pipe, - srcObj->buffer, - readOffset, size, - PIPE_TRANSFER_READ, - &src_transfer); - - dstPtr = (ubyte *) pipe_buffer_map_range(pipe, - dstObj->buffer, - writeOffset, size, - PIPE_TRANSFER_WRITE, - &dst_transfer); - - if (srcPtr && dstPtr) - memcpy(dstPtr + writeOffset, srcPtr + readOffset, size); - - pipe_buffer_unmap(pipe, src_transfer); - pipe_buffer_unmap(pipe, dst_transfer); -} - - -/* TODO: if buffer wasn't created with appropriate usage flags, need - * to recreate it now and copy contents -- or possibly create a - * gallium entrypoint to extend the usage flags and let the driver - * decide if a copy is necessary. 
- */ -void -st_bufferobj_validate_usage(struct st_context *st, - struct st_buffer_object *obj, - unsigned usage) -{ -} - - -void -st_init_bufferobject_functions(struct dd_function_table *functions) -{ - functions->NewBufferObject = st_bufferobj_alloc; - functions->DeleteBuffer = st_bufferobj_free; - functions->BufferData = st_bufferobj_data; - functions->BufferSubData = st_bufferobj_subdata; - functions->GetBufferSubData = st_bufferobj_get_subdata; - functions->MapBuffer = st_bufferobj_map; - functions->MapBufferRange = st_bufferobj_map_range; - functions->FlushMappedBufferRange = st_bufferobj_flush_mapped_range; - functions->UnmapBuffer = st_bufferobj_unmap; - functions->CopyBufferSubData = st_copy_buffer_subdata; - - /* For GL_APPLE_vertex_array_object */ - functions->NewArrayObject = _mesa_new_array_object; - functions->DeleteArrayObject = _mesa_delete_array_object; -} +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +/** + * Functions for pixel buffer objects and vertex/element buffer objects. + */ + + +#include "main/imports.h" +#include "main/mtypes.h" +#include "main/arrayobj.h" +#include "main/bufferobj.h" + +#include "st_context.h" +#include "st_cb_bufferobjects.h" + +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "util/u_inlines.h" + + +/** + * There is some duplication between mesa's bufferobjects and our + * bufmgr buffers. Both have an integer handle and a hashtable to + * lookup an opaque structure. It would be nice if the handles and + * internal structure where somehow shared. + */ +static struct gl_buffer_object * +st_bufferobj_alloc(struct gl_context *ctx, GLuint name, GLenum target) +{ + struct st_buffer_object *st_obj = ST_CALLOC_STRUCT(st_buffer_object); + + if (!st_obj) + return NULL; + + _mesa_initialize_buffer_object(&st_obj->Base, name, target); + + return &st_obj->Base; +} + + + +/** + * Deallocate/free a vertex/pixel buffer object. + * Called via glDeleteBuffersARB(). 
+ */ +static void +st_bufferobj_free(struct gl_context *ctx, struct gl_buffer_object *obj) +{ + struct st_buffer_object *st_obj = st_buffer_object(obj); + + assert(obj->RefCount == 0); + assert(st_obj->transfer == NULL); + + if (st_obj->buffer) + pipe_resource_reference(&st_obj->buffer, NULL); + + free(st_obj); +} + + + +/** + * Replace data in a subrange of buffer object. If the data range + * specified by size + offset extends beyond the end of the buffer or + * if data is NULL, no copy is performed. + * Called via glBufferSubDataARB(). + */ +static void +st_bufferobj_subdata(struct gl_context *ctx, + GLenum target, + GLintptrARB offset, + GLsizeiptrARB size, + const GLvoid * data, struct gl_buffer_object *obj) +{ + struct st_buffer_object *st_obj = st_buffer_object(obj); + + /* we may be called from VBO code, so double-check params here */ + ASSERT(offset >= 0); + ASSERT(size >= 0); + ASSERT(offset + size <= obj->Size); + + if (!size) + return; + + /* + * According to ARB_vertex_buffer_object specification, if data is null, + * then the contents of the buffer object's data store is undefined. We just + * ignore, and leave it unchanged. + */ + if (!data) + return; + + /* Now that transfers are per-context, we don't have to figure out + * flushing here. Usually drivers won't need to flush in this case + * even if the buffer is currently referenced by hardware - they + * just queue the upload as dma rather than mapping the underlying + * buffer directly. + */ + pipe_buffer_write(st_context(ctx)->pipe, + st_obj->buffer, + offset, size, data); +} + + +/** + * Called via glGetBufferSubDataARB(). + */ +static void +st_bufferobj_get_subdata(struct gl_context *ctx, + GLenum target, + GLintptrARB offset, + GLsizeiptrARB size, + GLvoid * data, struct gl_buffer_object *obj) +{ + struct st_buffer_object *st_obj = st_buffer_object(obj); + + /* we may be called from VBO code, so double-check params here */ + ASSERT(offset >= 0); + ASSERT(size >= 0); + ASSERT(offset + size <= obj->Size); + + if (!size) + return; + + pipe_buffer_read(st_context(ctx)->pipe, st_obj->buffer, + offset, size, data); +} + + +/** + * Allocate space for and store data in a buffer object. Any data that was + * previously stored in the buffer object is lost. If data is NULL, + * memory will be allocated, but no copy will occur. + * Called via ctx->Driver.BufferData(). 
+ * \return GL_TRUE for success, GL_FALSE if out of memory + */ +static GLboolean +st_bufferobj_data(struct gl_context *ctx, + GLenum target, + GLsizeiptrARB size, + const GLvoid * data, + GLenum usage, + struct gl_buffer_object *obj) +{ + struct st_context *st = st_context(ctx); + struct pipe_context *pipe = st->pipe; + struct st_buffer_object *st_obj = st_buffer_object(obj); + unsigned bind, pipe_usage; + + st_obj->Base.Size = size; + st_obj->Base.Usage = usage; + + switch(target) { + case GL_PIXEL_PACK_BUFFER_ARB: + case GL_PIXEL_UNPACK_BUFFER_ARB: + bind = PIPE_BIND_RENDER_TARGET | PIPE_BIND_SAMPLER_VIEW; + break; + case GL_ARRAY_BUFFER_ARB: + bind = PIPE_BIND_VERTEX_BUFFER; + break; + case GL_ELEMENT_ARRAY_BUFFER_ARB: + bind = PIPE_BIND_INDEX_BUFFER; + break; + default: + bind = 0; + } + + switch (usage) { + case GL_STATIC_DRAW: + case GL_STATIC_READ: + case GL_STATIC_COPY: + pipe_usage = PIPE_USAGE_STATIC; + break; + case GL_DYNAMIC_DRAW: + case GL_DYNAMIC_READ: + case GL_DYNAMIC_COPY: + pipe_usage = PIPE_USAGE_DYNAMIC; + break; + case GL_STREAM_DRAW: + case GL_STREAM_READ: + case GL_STREAM_COPY: + pipe_usage = PIPE_USAGE_STREAM; + break; + default: + pipe_usage = PIPE_USAGE_DEFAULT; + } + + pipe_resource_reference( &st_obj->buffer, NULL ); + + if (size != 0) { + st_obj->buffer = pipe_buffer_create(pipe->screen, bind, + pipe_usage, size); + + if (!st_obj->buffer) { + return GL_FALSE; + } + + if (data) + pipe_buffer_write(st_context(ctx)->pipe, st_obj->buffer, 0, + size, data); + return GL_TRUE; + } + + return GL_TRUE; +} + + +/** + * Dummy data whose's pointer is used for zero size buffers or ranges. + */ +static long st_bufferobj_zero_length = 0; + + + +/** + * Called via glMapBufferARB(). + */ +static void * +st_bufferobj_map(struct gl_context *ctx, GLenum target, GLenum access, + struct gl_buffer_object *obj) +{ + struct st_buffer_object *st_obj = st_buffer_object(obj); + uint flags; + + switch (access) { + case GL_WRITE_ONLY: + flags = PIPE_TRANSFER_WRITE; + break; + case GL_READ_ONLY: + flags = PIPE_TRANSFER_READ; + break; + case GL_READ_WRITE: + default: + flags = PIPE_TRANSFER_READ_WRITE; + break; + } + + /* Handle zero-size buffers here rather than in drivers */ + if (obj->Size == 0) { + obj->Pointer = &st_bufferobj_zero_length; + } + else { + obj->Pointer = pipe_buffer_map(st_context(ctx)->pipe, + st_obj->buffer, + flags, + &st_obj->transfer); + } + + if (obj->Pointer) { + obj->Offset = 0; + obj->Length = obj->Size; + } + return obj->Pointer; +} + + +/** + * Called via glMapBufferRange(). + */ +static void * +st_bufferobj_map_range(struct gl_context *ctx, GLenum target, + GLintptr offset, GLsizeiptr length, GLbitfield access, + struct gl_buffer_object *obj) +{ + struct pipe_context *pipe = st_context(ctx)->pipe; + struct st_buffer_object *st_obj = st_buffer_object(obj); + enum pipe_transfer_usage flags = 0x0; + + if (access & GL_MAP_WRITE_BIT) + flags |= PIPE_TRANSFER_WRITE; + + if (access & GL_MAP_READ_BIT) + flags |= PIPE_TRANSFER_READ; + + if (access & GL_MAP_FLUSH_EXPLICIT_BIT) + flags |= PIPE_TRANSFER_FLUSH_EXPLICIT; + + if (access & GL_MAP_INVALIDATE_RANGE_BIT) + flags |= PIPE_TRANSFER_DISCARD; + + if (access & GL_MAP_INVALIDATE_BUFFER_BIT) + flags |= PIPE_TRANSFER_DISCARD; + + if (access & GL_MAP_UNSYNCHRONIZED_BIT) + flags |= PIPE_TRANSFER_UNSYNCHRONIZED; + + /* ... other flags ... 
+ */ + + if (access & MESA_MAP_NOWAIT_BIT) + flags |= PIPE_TRANSFER_DONTBLOCK; + + assert(offset >= 0); + assert(length >= 0); + assert(offset < obj->Size); + assert(offset + length <= obj->Size); + + /* + * We go out of way here to hide the degenerate yet valid case of zero + * length range from the pipe driver. + */ + if (!length) { + obj->Pointer = &st_bufferobj_zero_length; + } + else { + obj->Pointer = pipe_buffer_map_range(pipe, + st_obj->buffer, + offset, length, + flags, + &st_obj->transfer); + if (obj->Pointer) { + obj->Pointer = (ubyte *) obj->Pointer + offset; + } + } + + if (obj->Pointer) { + obj->Offset = offset; + obj->Length = length; + obj->AccessFlags = access; + } + + return obj->Pointer; +} + + +static void +st_bufferobj_flush_mapped_range(struct gl_context *ctx, GLenum target, + GLintptr offset, GLsizeiptr length, + struct gl_buffer_object *obj) +{ + struct pipe_context *pipe = st_context(ctx)->pipe; + struct st_buffer_object *st_obj = st_buffer_object(obj); + + /* Subrange is relative to mapped range */ + assert(offset >= 0); + assert(length >= 0); + assert(offset + length <= obj->Length); + assert(obj->Pointer); + + if (!length) + return; + + pipe_buffer_flush_mapped_range(pipe, st_obj->transfer, + obj->Offset + offset, length); +} + + +/** + * Called via glUnmapBufferARB(). + */ +static GLboolean +st_bufferobj_unmap(struct gl_context *ctx, GLenum target, struct gl_buffer_object *obj) +{ + struct pipe_context *pipe = st_context(ctx)->pipe; + struct st_buffer_object *st_obj = st_buffer_object(obj); + + if (obj->Length) + pipe_buffer_unmap(pipe, st_obj->transfer); + + st_obj->transfer = NULL; + obj->Pointer = NULL; + obj->Offset = 0; + obj->Length = 0; + return GL_TRUE; +} + + +/** + * Called via glCopyBufferSubData(). + */ +static void +st_copy_buffer_subdata(struct gl_context *ctx, + struct gl_buffer_object *src, + struct gl_buffer_object *dst, + GLintptr readOffset, GLintptr writeOffset, + GLsizeiptr size) +{ + struct pipe_context *pipe = st_context(ctx)->pipe; + struct st_buffer_object *srcObj = st_buffer_object(src); + struct st_buffer_object *dstObj = st_buffer_object(dst); + struct pipe_transfer *src_transfer; + struct pipe_transfer *dst_transfer; + ubyte *srcPtr, *dstPtr; + + if(!size) + return; + + /* buffer should not already be mapped */ + assert(!src->Pointer); + assert(!dst->Pointer); + + srcPtr = (ubyte *) pipe_buffer_map_range(pipe, + srcObj->buffer, + readOffset, size, + PIPE_TRANSFER_READ, + &src_transfer); + + dstPtr = (ubyte *) pipe_buffer_map_range(pipe, + dstObj->buffer, + writeOffset, size, + PIPE_TRANSFER_WRITE, + &dst_transfer); + + if (srcPtr && dstPtr) + memcpy(dstPtr + writeOffset, srcPtr + readOffset, size); + + pipe_buffer_unmap(pipe, src_transfer); + pipe_buffer_unmap(pipe, dst_transfer); +} + + +/* TODO: if buffer wasn't created with appropriate usage flags, need + * to recreate it now and copy contents -- or possibly create a + * gallium entrypoint to extend the usage flags and let the driver + * decide if a copy is necessary. 
+ */ +void +st_bufferobj_validate_usage(struct st_context *st, + struct st_buffer_object *obj, + unsigned usage) +{ +} + + +void +st_init_bufferobject_functions(struct dd_function_table *functions) +{ + functions->NewBufferObject = st_bufferobj_alloc; + functions->DeleteBuffer = st_bufferobj_free; + functions->BufferData = st_bufferobj_data; + functions->BufferSubData = st_bufferobj_subdata; + functions->GetBufferSubData = st_bufferobj_get_subdata; + functions->MapBuffer = st_bufferobj_map; + functions->MapBufferRange = st_bufferobj_map_range; + functions->FlushMappedBufferRange = st_bufferobj_flush_mapped_range; + functions->UnmapBuffer = st_bufferobj_unmap; + functions->CopyBufferSubData = st_copy_buffer_subdata; + + /* For GL_APPLE_vertex_array_object */ + functions->NewArrayObject = _mesa_new_array_object; + functions->DeleteArrayObject = _mesa_delete_array_object; +} diff --git a/mesalib/src/mesa/state_tracker/st_cb_clear.c b/mesalib/src/mesa/state_tracker/st_cb_clear.c index 3e27be271..d2e0cd73c 100644 --- a/mesalib/src/mesa/state_tracker/st_cb_clear.c +++ b/mesalib/src/mesa/state_tracker/st_cb_clear.c @@ -1,559 +1,563 @@ -/************************************************************************** - * - * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. - * All Rights Reserved. - * Copyright 2009 VMware, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. - * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR - * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - **************************************************************************/ - - /* - * Authors: - * Keith Whitwell - * Brian Paul - * Michel Dänzer - */ - -#include "main/glheader.h" -#include "main/formats.h" -#include "main/macros.h" -#include "program/prog_instruction.h" -#include "st_context.h" -#include "st_atom.h" -#include "st_cb_accum.h" -#include "st_cb_clear.h" -#include "st_cb_fbo.h" -#include "st_program.h" - -#include "pipe/p_context.h" -#include "pipe/p_shader_tokens.h" -#include "pipe/p_state.h" -#include "pipe/p_defines.h" -#include "util/u_format.h" -#include "util/u_inlines.h" -#include "util/u_simple_shaders.h" -#include "util/u_draw_quad.h" - -#include "cso_cache/cso_context.h" - - -/** - * Do per-context initialization for glClear. 
- */ -void -st_init_clear(struct st_context *st) -{ - struct pipe_context *pipe = st->pipe; - struct pipe_screen *pscreen = st->pipe->screen; - - memset(&st->clear, 0, sizeof(st->clear)); - - st->clear.raster.gl_rasterization_rules = 1; - st->clear.enable_ds_separate = pscreen->get_param(pscreen, PIPE_CAP_DEPTHSTENCIL_CLEAR_SEPARATE); - - /* fragment shader state: color pass-through program */ - st->clear.fs = util_make_fragment_passthrough_shader(pipe); - - /* vertex shader state: color/position pass-through */ - { - const uint semantic_names[] = { TGSI_SEMANTIC_POSITION, - TGSI_SEMANTIC_COLOR }; - const uint semantic_indexes[] = { 0, 0 }; - st->clear.vs = util_make_vertex_passthrough_shader(pipe, 2, - semantic_names, - semantic_indexes); - } -} - - -/** - * Free per-context state for glClear. - */ -void -st_destroy_clear(struct st_context *st) -{ - if (st->clear.fs) { - cso_delete_fragment_shader(st->cso_context, st->clear.fs); - st->clear.fs = NULL; - } - if (st->clear.vs) { - cso_delete_vertex_shader(st->cso_context, st->clear.vs); - st->clear.vs = NULL; - } - if (st->clear.vbuf) { - pipe_resource_reference(&st->clear.vbuf, NULL); - st->clear.vbuf = NULL; - } -} - - -/** - * Draw a screen-aligned quadrilateral. - * Coords are clip coords with y=0=bottom. - */ -static void -draw_quad(struct st_context *st, - float x0, float y0, float x1, float y1, GLfloat z, - const GLfloat color[4]) -{ - struct pipe_context *pipe = st->pipe; - - /* XXX: Need to improve buffer_write to allow NO_WAIT (as well as - * no_flush) updates to buffers where we know there is no conflict - * with previous data. Currently using max_slots > 1 will cause - * synchronous rendering if the driver flushes its command buffers - * between one bitmap and the next. Our flush hook below isn't - * sufficient to catch this as the driver doesn't tell us when it - * flushes its own command buffers. Until this gets fixed, pay the - * price of allocating a new buffer for each bitmap cache-flush to - * avoid synchronous rendering. - */ - const GLuint max_slots = 1; /* 1024 / sizeof(st->clear.vertices); */ - GLuint i; - - if (st->clear.vbuf_slot >= max_slots) { - pipe_resource_reference(&st->clear.vbuf, NULL); - st->clear.vbuf_slot = 0; - } - - if (!st->clear.vbuf) { - st->clear.vbuf = pipe_buffer_create(pipe->screen, - PIPE_BIND_VERTEX_BUFFER, - max_slots * sizeof(st->clear.vertices)); - } - - /* positions */ - st->clear.vertices[0][0][0] = x0; - st->clear.vertices[0][0][1] = y0; - - st->clear.vertices[1][0][0] = x1; - st->clear.vertices[1][0][1] = y0; - - st->clear.vertices[2][0][0] = x1; - st->clear.vertices[2][0][1] = y1; - - st->clear.vertices[3][0][0] = x0; - st->clear.vertices[3][0][1] = y1; - - /* same for all verts: */ - for (i = 0; i < 4; i++) { - st->clear.vertices[i][0][2] = z; - st->clear.vertices[i][0][3] = 1.0; - st->clear.vertices[i][1][0] = color[0]; - st->clear.vertices[i][1][1] = color[1]; - st->clear.vertices[i][1][2] = color[2]; - st->clear.vertices[i][1][3] = color[3]; - } - - /* put vertex data into vbuf */ - pipe_buffer_write_nooverlap(st->pipe, st->clear.vbuf, - st->clear.vbuf_slot - * sizeof(st->clear.vertices), - sizeof(st->clear.vertices), - st->clear.vertices); - - /* draw */ - util_draw_vertex_buffer(pipe, - st->clear.vbuf, - st->clear.vbuf_slot * sizeof(st->clear.vertices), - PIPE_PRIM_TRIANGLE_FAN, - 4, /* verts */ - 2); /* attribs/vert */ - - /* Increment slot */ - st->clear.vbuf_slot++; -} - - - -/** - * Do glClear by drawing a quadrilateral. 
- * The vertices of the quad will be computed from the - * ctx->DrawBuffer->_X/Ymin/max fields. - */ -static void -clear_with_quad(struct gl_context *ctx, - GLboolean color, GLboolean depth, GLboolean stencil) -{ - struct st_context *st = st_context(ctx); - const struct gl_framebuffer *fb = ctx->DrawBuffer; - const GLfloat fb_width = (GLfloat) fb->Width; - const GLfloat fb_height = (GLfloat) fb->Height; - const GLfloat x0 = (GLfloat) ctx->DrawBuffer->_Xmin / fb_width * 2.0f - 1.0f; - const GLfloat x1 = (GLfloat) ctx->DrawBuffer->_Xmax / fb_width * 2.0f - 1.0f; - const GLfloat y0 = (GLfloat) ctx->DrawBuffer->_Ymin / fb_height * 2.0f - 1.0f; - const GLfloat y1 = (GLfloat) ctx->DrawBuffer->_Ymax / fb_height * 2.0f - 1.0f; - - /* - printf("%s %s%s%s %f,%f %f,%f\n", __FUNCTION__, - color ? "color, " : "", - depth ? "depth, " : "", - stencil ? "stencil" : "", - x0, y0, - x1, y1); - */ - - cso_save_blend(st->cso_context); - cso_save_stencil_ref(st->cso_context); - cso_save_depth_stencil_alpha(st->cso_context); - cso_save_rasterizer(st->cso_context); - cso_save_viewport(st->cso_context); - cso_save_clip(st->cso_context); - cso_save_fragment_shader(st->cso_context); - cso_save_vertex_shader(st->cso_context); - cso_save_vertex_elements(st->cso_context); - - /* blend state: RGBA masking */ - { - struct pipe_blend_state blend; - memset(&blend, 0, sizeof(blend)); - blend.rt[0].rgb_src_factor = PIPE_BLENDFACTOR_ONE; - blend.rt[0].alpha_src_factor = PIPE_BLENDFACTOR_ONE; - blend.rt[0].rgb_dst_factor = PIPE_BLENDFACTOR_ZERO; - blend.rt[0].alpha_dst_factor = PIPE_BLENDFACTOR_ZERO; - if (color) { - if (ctx->Color.ColorMask[0][0]) - blend.rt[0].colormask |= PIPE_MASK_R; - if (ctx->Color.ColorMask[0][1]) - blend.rt[0].colormask |= PIPE_MASK_G; - if (ctx->Color.ColorMask[0][2]) - blend.rt[0].colormask |= PIPE_MASK_B; - if (ctx->Color.ColorMask[0][3]) - blend.rt[0].colormask |= PIPE_MASK_A; - if (st->ctx->Color.DitherFlag) - blend.dither = 1; - } - cso_set_blend(st->cso_context, &blend); - } - - /* depth_stencil state: always pass/set to ref value */ - { - struct pipe_depth_stencil_alpha_state depth_stencil; - memset(&depth_stencil, 0, sizeof(depth_stencil)); - if (depth) { - depth_stencil.depth.enabled = 1; - depth_stencil.depth.writemask = 1; - depth_stencil.depth.func = PIPE_FUNC_ALWAYS; - } - - if (stencil) { - struct pipe_stencil_ref stencil_ref; - memset(&stencil_ref, 0, sizeof(stencil_ref)); - depth_stencil.stencil[0].enabled = 1; - depth_stencil.stencil[0].func = PIPE_FUNC_ALWAYS; - depth_stencil.stencil[0].fail_op = PIPE_STENCIL_OP_REPLACE; - depth_stencil.stencil[0].zpass_op = PIPE_STENCIL_OP_REPLACE; - depth_stencil.stencil[0].zfail_op = PIPE_STENCIL_OP_REPLACE; - depth_stencil.stencil[0].valuemask = 0xff; - depth_stencil.stencil[0].writemask = ctx->Stencil.WriteMask[0] & 0xff; - stencil_ref.ref_value[0] = ctx->Stencil.Clear; - cso_set_stencil_ref(st->cso_context, &stencil_ref); - } - - cso_set_depth_stencil_alpha(st->cso_context, &depth_stencil); - } - - cso_set_vertex_elements(st->cso_context, 2, st->velems_util_draw); - - cso_set_rasterizer(st->cso_context, &st->clear.raster); - - /* viewport state: viewport matching window dims */ - { - const GLboolean invert = (st_fb_orientation(fb) == Y_0_TOP); - struct pipe_viewport_state vp; - vp.scale[0] = 0.5f * fb_width; - vp.scale[1] = fb_height * (invert ? 
-0.5f : 0.5f); - vp.scale[2] = 1.0f; - vp.scale[3] = 1.0f; - vp.translate[0] = 0.5f * fb_width; - vp.translate[1] = 0.5f * fb_height; - vp.translate[2] = 0.0f; - vp.translate[3] = 0.0f; - cso_set_viewport(st->cso_context, &vp); - } - - cso_set_clip(st->cso_context, &st->clear.clip); - cso_set_fragment_shader_handle(st->cso_context, st->clear.fs); - cso_set_vertex_shader_handle(st->cso_context, st->clear.vs); - - /* draw quad matching scissor rect (XXX verify coord round-off) */ - draw_quad(st, x0, y0, x1, y1, - (GLfloat) ctx->Depth.Clear, ctx->Color.ClearColor); - - /* Restore pipe state */ - cso_restore_blend(st->cso_context); - cso_restore_stencil_ref(st->cso_context); - cso_restore_depth_stencil_alpha(st->cso_context); - cso_restore_rasterizer(st->cso_context); - cso_restore_viewport(st->cso_context); - cso_restore_clip(st->cso_context); - cso_restore_fragment_shader(st->cso_context); - cso_restore_vertex_shader(st->cso_context); - cso_restore_vertex_elements(st->cso_context); -} - - -/** - * Determine if we need to clear the depth buffer by drawing a quad. - */ -static INLINE GLboolean -check_clear_color_with_quad(struct gl_context *ctx, struct gl_renderbuffer *rb) -{ - if (ctx->Scissor.Enabled && - (ctx->Scissor.X != 0 || - ctx->Scissor.Y != 0 || - ctx->Scissor.Width < rb->Width || - ctx->Scissor.Height < rb->Height)) - return GL_TRUE; - - if (!ctx->Color.ColorMask[0][0] || - !ctx->Color.ColorMask[0][1] || - !ctx->Color.ColorMask[0][2] || - !ctx->Color.ColorMask[0][3]) - return GL_TRUE; - - return GL_FALSE; -} - - -/** - * Determine if we need to clear the combiend depth/stencil buffer by - * drawing a quad. - */ -static INLINE GLboolean -check_clear_depth_stencil_with_quad(struct gl_context *ctx, struct gl_renderbuffer *rb) -{ - const GLuint stencilMax = 0xff; - GLboolean maskStencil - = (ctx->Stencil.WriteMask[0] & stencilMax) != stencilMax; - - assert(rb->Format == MESA_FORMAT_S8 || - rb->Format == MESA_FORMAT_Z24_S8 || - rb->Format == MESA_FORMAT_S8_Z24); - - if (ctx->Scissor.Enabled && - (ctx->Scissor.X != 0 || - ctx->Scissor.Y != 0 || - ctx->Scissor.Width < rb->Width || - ctx->Scissor.Height < rb->Height)) - return GL_TRUE; - - if (maskStencil) - return GL_TRUE; - - return GL_FALSE; -} - - -/** - * Determine if we need to clear the depth buffer by drawing a quad. - */ -static INLINE GLboolean -check_clear_depth_with_quad(struct gl_context *ctx, struct gl_renderbuffer *rb, - boolean ds_separate) -{ - const struct st_renderbuffer *strb = st_renderbuffer(rb); - const GLboolean isDS = util_format_is_depth_and_stencil(strb->surface->format); - - if (ctx->Scissor.Enabled && - (ctx->Scissor.X != 0 || - ctx->Scissor.Y != 0 || - ctx->Scissor.Width < rb->Width || - ctx->Scissor.Height < rb->Height)) - return GL_TRUE; - - if (!ds_separate && isDS && ctx->DrawBuffer->Visual.stencilBits > 0) - return GL_TRUE; - - return GL_FALSE; -} - - -/** - * Determine if we need to clear the stencil buffer by drawing a quad. 
- */ -static INLINE GLboolean -check_clear_stencil_with_quad(struct gl_context *ctx, struct gl_renderbuffer *rb, - boolean ds_separate) -{ - const struct st_renderbuffer *strb = st_renderbuffer(rb); - const GLboolean isDS = util_format_is_depth_and_stencil(strb->surface->format); - const GLuint stencilMax = 0xff; - const GLboolean maskStencil - = (ctx->Stencil.WriteMask[0] & stencilMax) != stencilMax; - - assert(rb->Format == MESA_FORMAT_S8 || - rb->Format == MESA_FORMAT_Z24_S8 || - rb->Format == MESA_FORMAT_S8_Z24); - - if (maskStencil) - return GL_TRUE; - - if (ctx->Scissor.Enabled && - (ctx->Scissor.X != 0 || - ctx->Scissor.Y != 0 || - ctx->Scissor.Width < rb->Width || - ctx->Scissor.Height < rb->Height)) - return GL_TRUE; - - /* This is correct, but it is necessary to look at the depth clear - * value held in the surface when it comes time to issue the clear, - * rather than taking depth and stencil clear values from the - * current state. - */ - if (!ds_separate && isDS && ctx->DrawBuffer->Visual.depthBits > 0) - return GL_TRUE; - - return GL_FALSE; -} - - - -/** - * Called when we need to flush. - */ -void -st_flush_clear(struct st_context *st) -{ - /* Release vertex buffer to avoid synchronous rendering if we were - * to map it in the next frame. - */ - pipe_resource_reference(&st->clear.vbuf, NULL); - st->clear.vbuf_slot = 0; -} - - - -/** - * Called via ctx->Driver.Clear() - */ -static void -st_Clear(struct gl_context *ctx, GLbitfield mask) -{ - static const GLbitfield BUFFER_BITS_DS - = (BUFFER_BIT_DEPTH | BUFFER_BIT_STENCIL); - struct st_context *st = st_context(ctx); - struct gl_renderbuffer *depthRb - = ctx->DrawBuffer->Attachment[BUFFER_DEPTH].Renderbuffer; - struct gl_renderbuffer *stencilRb - = ctx->DrawBuffer->Attachment[BUFFER_STENCIL].Renderbuffer; - GLbitfield quad_buffers = 0x0; - GLbitfield clear_buffers = 0x0; - GLuint i; - - /* This makes sure the pipe has the latest scissor, etc values */ - st_validate_state( st ); - - if (mask & BUFFER_BITS_COLOR) { - for (i = 0; i < ctx->DrawBuffer->_NumColorDrawBuffers; i++) { - GLuint b = ctx->DrawBuffer->_ColorDrawBufferIndexes[i]; - - if (mask & (1 << b)) { - struct gl_renderbuffer *rb - = ctx->DrawBuffer->Attachment[b].Renderbuffer; - struct st_renderbuffer *strb = st_renderbuffer(rb); - - if (!strb || !strb->surface) - continue; - - if (check_clear_color_with_quad( ctx, rb )) - quad_buffers |= PIPE_CLEAR_COLOR; - else - clear_buffers |= PIPE_CLEAR_COLOR; - } - } - } - - if ((mask & BUFFER_BITS_DS) == BUFFER_BITS_DS && depthRb == stencilRb) { - /* clearing combined depth + stencil */ - struct st_renderbuffer *strb = st_renderbuffer(depthRb); - - if (strb->surface) { - if (check_clear_depth_stencil_with_quad(ctx, depthRb)) - quad_buffers |= PIPE_CLEAR_DEPTHSTENCIL; - else - clear_buffers |= PIPE_CLEAR_DEPTHSTENCIL; - } - } - else { - /* separate depth/stencil clears */ - /* I don't think truly separate buffers are actually possible in gallium or hw? 
*/ - if (mask & BUFFER_BIT_DEPTH) { - struct st_renderbuffer *strb = st_renderbuffer(depthRb); - - if (strb->surface) { - if (check_clear_depth_with_quad(ctx, depthRb, - st->clear.enable_ds_separate)) - quad_buffers |= PIPE_CLEAR_DEPTH; - else - clear_buffers |= PIPE_CLEAR_DEPTH; - } - } - if (mask & BUFFER_BIT_STENCIL) { - struct st_renderbuffer *strb = st_renderbuffer(stencilRb); - - if (strb->surface) { - if (check_clear_stencil_with_quad(ctx, stencilRb, - st->clear.enable_ds_separate)) - quad_buffers |= PIPE_CLEAR_STENCIL; - else - clear_buffers |= PIPE_CLEAR_STENCIL; - } - } - } - - /* - * If we're going to use clear_with_quad() for any reason, use it for - * everything possible. - */ - if (quad_buffers) { - quad_buffers |= clear_buffers; - clear_with_quad(ctx, - quad_buffers & PIPE_CLEAR_COLOR, - quad_buffers & PIPE_CLEAR_DEPTH, - quad_buffers & PIPE_CLEAR_STENCIL); - } else if (clear_buffers) { - /* driver cannot know it can clear everything if the buffer - * is a combined depth/stencil buffer but this wasn't actually - * required from the visual. Hence fix this up to avoid potential - * read-modify-write in the driver. - */ - if ((clear_buffers & PIPE_CLEAR_DEPTHSTENCIL) && - ((clear_buffers & PIPE_CLEAR_DEPTHSTENCIL) != PIPE_CLEAR_DEPTHSTENCIL) && - (depthRb == stencilRb) && - (ctx->DrawBuffer->Visual.depthBits == 0 || - ctx->DrawBuffer->Visual.stencilBits == 0)) - clear_buffers |= PIPE_CLEAR_DEPTHSTENCIL; - st->pipe->clear(st->pipe, clear_buffers, ctx->Color.ClearColor, - ctx->Depth.Clear, ctx->Stencil.Clear); - } - if (mask & BUFFER_BIT_ACCUM) - st_clear_accum_buffer(ctx, - ctx->DrawBuffer->Attachment[BUFFER_ACCUM].Renderbuffer); -} - - -void -st_init_clear_functions(struct dd_function_table *functions) -{ - functions->Clear = st_Clear; -} +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * Copyright 2009 VMware, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + + /* + * Authors: + * Keith Whitwell + * Brian Paul + * Michel Dänzer + */ + +#include "main/glheader.h" +#include "main/formats.h" +#include "main/macros.h" +#include "program/prog_instruction.h" +#include "st_context.h" +#include "st_atom.h" +#include "st_cb_accum.h" +#include "st_cb_clear.h" +#include "st_cb_fbo.h" +#include "st_program.h" + +#include "pipe/p_context.h" +#include "pipe/p_shader_tokens.h" +#include "pipe/p_state.h" +#include "pipe/p_defines.h" +#include "util/u_format.h" +#include "util/u_inlines.h" +#include "util/u_simple_shaders.h" +#include "util/u_draw_quad.h" + +#include "cso_cache/cso_context.h" + + +/** + * Do per-context initialization for glClear. + */ +void +st_init_clear(struct st_context *st) +{ + struct pipe_context *pipe = st->pipe; + struct pipe_screen *pscreen = st->pipe->screen; + + memset(&st->clear, 0, sizeof(st->clear)); + + st->clear.raster.gl_rasterization_rules = 1; + st->clear.enable_ds_separate = pscreen->get_param(pscreen, PIPE_CAP_DEPTHSTENCIL_CLEAR_SEPARATE); + + /* fragment shader state: color pass-through program */ + st->clear.fs = util_make_fragment_passthrough_shader(pipe); + + /* vertex shader state: color/position pass-through */ + { + const uint semantic_names[] = { TGSI_SEMANTIC_POSITION, + TGSI_SEMANTIC_COLOR }; + const uint semantic_indexes[] = { 0, 0 }; + st->clear.vs = util_make_vertex_passthrough_shader(pipe, 2, + semantic_names, + semantic_indexes); + } +} + + +/** + * Free per-context state for glClear. + */ +void +st_destroy_clear(struct st_context *st) +{ + if (st->clear.fs) { + cso_delete_fragment_shader(st->cso_context, st->clear.fs); + st->clear.fs = NULL; + } + if (st->clear.vs) { + cso_delete_vertex_shader(st->cso_context, st->clear.vs); + st->clear.vs = NULL; + } + if (st->clear.vbuf) { + pipe_resource_reference(&st->clear.vbuf, NULL); + st->clear.vbuf = NULL; + } +} + + +/** + * Draw a screen-aligned quadrilateral. + * Coords are clip coords with y=0=bottom. + */ +static void +draw_quad(struct st_context *st, + float x0, float y0, float x1, float y1, GLfloat z, + const GLfloat color[4]) +{ + struct pipe_context *pipe = st->pipe; + + /* XXX: Need to improve buffer_write to allow NO_WAIT (as well as + * no_flush) updates to buffers where we know there is no conflict + * with previous data. Currently using max_slots > 1 will cause + * synchronous rendering if the driver flushes its command buffers + * between one bitmap and the next. Our flush hook below isn't + * sufficient to catch this as the driver doesn't tell us when it + * flushes its own command buffers. Until this gets fixed, pay the + * price of allocating a new buffer for each bitmap cache-flush to + * avoid synchronous rendering. 
+ */ + const GLuint max_slots = 1; /* 1024 / sizeof(st->clear.vertices); */ + GLuint i; + + if (st->clear.vbuf_slot >= max_slots) { + pipe_resource_reference(&st->clear.vbuf, NULL); + st->clear.vbuf_slot = 0; + } + + if (!st->clear.vbuf) { + st->clear.vbuf = pipe_buffer_create(pipe->screen, + PIPE_BIND_VERTEX_BUFFER, + PIPE_USAGE_STREAM, + max_slots * sizeof(st->clear.vertices)); + } + + /* positions */ + st->clear.vertices[0][0][0] = x0; + st->clear.vertices[0][0][1] = y0; + + st->clear.vertices[1][0][0] = x1; + st->clear.vertices[1][0][1] = y0; + + st->clear.vertices[2][0][0] = x1; + st->clear.vertices[2][0][1] = y1; + + st->clear.vertices[3][0][0] = x0; + st->clear.vertices[3][0][1] = y1; + + /* same for all verts: */ + for (i = 0; i < 4; i++) { + st->clear.vertices[i][0][2] = z; + st->clear.vertices[i][0][3] = 1.0; + st->clear.vertices[i][1][0] = color[0]; + st->clear.vertices[i][1][1] = color[1]; + st->clear.vertices[i][1][2] = color[2]; + st->clear.vertices[i][1][3] = color[3]; + } + + /* put vertex data into vbuf */ + pipe_buffer_write_nooverlap(st->pipe, st->clear.vbuf, + st->clear.vbuf_slot + * sizeof(st->clear.vertices), + sizeof(st->clear.vertices), + st->clear.vertices); + + /* draw */ + util_draw_vertex_buffer(pipe, + st->cso_context, + st->clear.vbuf, + st->clear.vbuf_slot * sizeof(st->clear.vertices), + PIPE_PRIM_TRIANGLE_FAN, + 4, /* verts */ + 2); /* attribs/vert */ + + /* Increment slot */ + st->clear.vbuf_slot++; +} + + + +/** + * Do glClear by drawing a quadrilateral. + * The vertices of the quad will be computed from the + * ctx->DrawBuffer->_X/Ymin/max fields. + */ +static void +clear_with_quad(struct gl_context *ctx, + GLboolean color, GLboolean depth, GLboolean stencil) +{ + struct st_context *st = st_context(ctx); + const struct gl_framebuffer *fb = ctx->DrawBuffer; + const GLfloat fb_width = (GLfloat) fb->Width; + const GLfloat fb_height = (GLfloat) fb->Height; + const GLfloat x0 = (GLfloat) ctx->DrawBuffer->_Xmin / fb_width * 2.0f - 1.0f; + const GLfloat x1 = (GLfloat) ctx->DrawBuffer->_Xmax / fb_width * 2.0f - 1.0f; + const GLfloat y0 = (GLfloat) ctx->DrawBuffer->_Ymin / fb_height * 2.0f - 1.0f; + const GLfloat y1 = (GLfloat) ctx->DrawBuffer->_Ymax / fb_height * 2.0f - 1.0f; + + /* + printf("%s %s%s%s %f,%f %f,%f\n", __FUNCTION__, + color ? "color, " : "", + depth ? "depth, " : "", + stencil ? 
"stencil" : "", + x0, y0, + x1, y1); + */ + + cso_save_blend(st->cso_context); + cso_save_stencil_ref(st->cso_context); + cso_save_depth_stencil_alpha(st->cso_context); + cso_save_rasterizer(st->cso_context); + cso_save_viewport(st->cso_context); + cso_save_clip(st->cso_context); + cso_save_fragment_shader(st->cso_context); + cso_save_vertex_shader(st->cso_context); + cso_save_vertex_elements(st->cso_context); + cso_save_vertex_buffers(st->cso_context); + + /* blend state: RGBA masking */ + { + struct pipe_blend_state blend; + memset(&blend, 0, sizeof(blend)); + blend.rt[0].rgb_src_factor = PIPE_BLENDFACTOR_ONE; + blend.rt[0].alpha_src_factor = PIPE_BLENDFACTOR_ONE; + blend.rt[0].rgb_dst_factor = PIPE_BLENDFACTOR_ZERO; + blend.rt[0].alpha_dst_factor = PIPE_BLENDFACTOR_ZERO; + if (color) { + if (ctx->Color.ColorMask[0][0]) + blend.rt[0].colormask |= PIPE_MASK_R; + if (ctx->Color.ColorMask[0][1]) + blend.rt[0].colormask |= PIPE_MASK_G; + if (ctx->Color.ColorMask[0][2]) + blend.rt[0].colormask |= PIPE_MASK_B; + if (ctx->Color.ColorMask[0][3]) + blend.rt[0].colormask |= PIPE_MASK_A; + if (st->ctx->Color.DitherFlag) + blend.dither = 1; + } + cso_set_blend(st->cso_context, &blend); + } + + /* depth_stencil state: always pass/set to ref value */ + { + struct pipe_depth_stencil_alpha_state depth_stencil; + memset(&depth_stencil, 0, sizeof(depth_stencil)); + if (depth) { + depth_stencil.depth.enabled = 1; + depth_stencil.depth.writemask = 1; + depth_stencil.depth.func = PIPE_FUNC_ALWAYS; + } + + if (stencil) { + struct pipe_stencil_ref stencil_ref; + memset(&stencil_ref, 0, sizeof(stencil_ref)); + depth_stencil.stencil[0].enabled = 1; + depth_stencil.stencil[0].func = PIPE_FUNC_ALWAYS; + depth_stencil.stencil[0].fail_op = PIPE_STENCIL_OP_REPLACE; + depth_stencil.stencil[0].zpass_op = PIPE_STENCIL_OP_REPLACE; + depth_stencil.stencil[0].zfail_op = PIPE_STENCIL_OP_REPLACE; + depth_stencil.stencil[0].valuemask = 0xff; + depth_stencil.stencil[0].writemask = ctx->Stencil.WriteMask[0] & 0xff; + stencil_ref.ref_value[0] = ctx->Stencil.Clear; + cso_set_stencil_ref(st->cso_context, &stencil_ref); + } + + cso_set_depth_stencil_alpha(st->cso_context, &depth_stencil); + } + + cso_set_vertex_elements(st->cso_context, 2, st->velems_util_draw); + + cso_set_rasterizer(st->cso_context, &st->clear.raster); + + /* viewport state: viewport matching window dims */ + { + const GLboolean invert = (st_fb_orientation(fb) == Y_0_TOP); + struct pipe_viewport_state vp; + vp.scale[0] = 0.5f * fb_width; + vp.scale[1] = fb_height * (invert ? 
-0.5f : 0.5f); + vp.scale[2] = 1.0f; + vp.scale[3] = 1.0f; + vp.translate[0] = 0.5f * fb_width; + vp.translate[1] = 0.5f * fb_height; + vp.translate[2] = 0.0f; + vp.translate[3] = 0.0f; + cso_set_viewport(st->cso_context, &vp); + } + + cso_set_clip(st->cso_context, &st->clear.clip); + cso_set_fragment_shader_handle(st->cso_context, st->clear.fs); + cso_set_vertex_shader_handle(st->cso_context, st->clear.vs); + + /* draw quad matching scissor rect (XXX verify coord round-off) */ + draw_quad(st, x0, y0, x1, y1, + (GLfloat) ctx->Depth.Clear, ctx->Color.ClearColor); + + /* Restore pipe state */ + cso_restore_blend(st->cso_context); + cso_restore_stencil_ref(st->cso_context); + cso_restore_depth_stencil_alpha(st->cso_context); + cso_restore_rasterizer(st->cso_context); + cso_restore_viewport(st->cso_context); + cso_restore_clip(st->cso_context); + cso_restore_fragment_shader(st->cso_context); + cso_restore_vertex_shader(st->cso_context); + cso_restore_vertex_elements(st->cso_context); + cso_restore_vertex_buffers(st->cso_context); +} + + +/** + * Determine if we need to clear the depth buffer by drawing a quad. + */ +static INLINE GLboolean +check_clear_color_with_quad(struct gl_context *ctx, struct gl_renderbuffer *rb) +{ + if (ctx->Scissor.Enabled && + (ctx->Scissor.X != 0 || + ctx->Scissor.Y != 0 || + ctx->Scissor.Width < rb->Width || + ctx->Scissor.Height < rb->Height)) + return GL_TRUE; + + if (!ctx->Color.ColorMask[0][0] || + !ctx->Color.ColorMask[0][1] || + !ctx->Color.ColorMask[0][2] || + !ctx->Color.ColorMask[0][3]) + return GL_TRUE; + + return GL_FALSE; +} + + +/** + * Determine if we need to clear the combiend depth/stencil buffer by + * drawing a quad. + */ +static INLINE GLboolean +check_clear_depth_stencil_with_quad(struct gl_context *ctx, struct gl_renderbuffer *rb) +{ + const GLuint stencilMax = 0xff; + GLboolean maskStencil + = (ctx->Stencil.WriteMask[0] & stencilMax) != stencilMax; + + assert(rb->Format == MESA_FORMAT_S8 || + rb->Format == MESA_FORMAT_Z24_S8 || + rb->Format == MESA_FORMAT_S8_Z24); + + if (ctx->Scissor.Enabled && + (ctx->Scissor.X != 0 || + ctx->Scissor.Y != 0 || + ctx->Scissor.Width < rb->Width || + ctx->Scissor.Height < rb->Height)) + return GL_TRUE; + + if (maskStencil) + return GL_TRUE; + + return GL_FALSE; +} + + +/** + * Determine if we need to clear the depth buffer by drawing a quad. + */ +static INLINE GLboolean +check_clear_depth_with_quad(struct gl_context *ctx, struct gl_renderbuffer *rb, + boolean ds_separate) +{ + const struct st_renderbuffer *strb = st_renderbuffer(rb); + const GLboolean isDS = util_format_is_depth_and_stencil(strb->surface->format); + + if (ctx->Scissor.Enabled && + (ctx->Scissor.X != 0 || + ctx->Scissor.Y != 0 || + ctx->Scissor.Width < rb->Width || + ctx->Scissor.Height < rb->Height)) + return GL_TRUE; + + if (!ds_separate && isDS && ctx->DrawBuffer->Visual.stencilBits > 0) + return GL_TRUE; + + return GL_FALSE; +} + + +/** + * Determine if we need to clear the stencil buffer by drawing a quad. 
+ */ +static INLINE GLboolean +check_clear_stencil_with_quad(struct gl_context *ctx, struct gl_renderbuffer *rb, + boolean ds_separate) +{ + const struct st_renderbuffer *strb = st_renderbuffer(rb); + const GLboolean isDS = util_format_is_depth_and_stencil(strb->surface->format); + const GLuint stencilMax = 0xff; + const GLboolean maskStencil + = (ctx->Stencil.WriteMask[0] & stencilMax) != stencilMax; + + assert(rb->Format == MESA_FORMAT_S8 || + rb->Format == MESA_FORMAT_Z24_S8 || + rb->Format == MESA_FORMAT_S8_Z24); + + if (maskStencil) + return GL_TRUE; + + if (ctx->Scissor.Enabled && + (ctx->Scissor.X != 0 || + ctx->Scissor.Y != 0 || + ctx->Scissor.Width < rb->Width || + ctx->Scissor.Height < rb->Height)) + return GL_TRUE; + + /* This is correct, but it is necessary to look at the depth clear + * value held in the surface when it comes time to issue the clear, + * rather than taking depth and stencil clear values from the + * current state. + */ + if (!ds_separate && isDS && ctx->DrawBuffer->Visual.depthBits > 0) + return GL_TRUE; + + return GL_FALSE; +} + + + +/** + * Called when we need to flush. + */ +void +st_flush_clear(struct st_context *st) +{ + /* Release vertex buffer to avoid synchronous rendering if we were + * to map it in the next frame. + */ + pipe_resource_reference(&st->clear.vbuf, NULL); + st->clear.vbuf_slot = 0; +} + + + +/** + * Called via ctx->Driver.Clear() + */ +static void +st_Clear(struct gl_context *ctx, GLbitfield mask) +{ + static const GLbitfield BUFFER_BITS_DS + = (BUFFER_BIT_DEPTH | BUFFER_BIT_STENCIL); + struct st_context *st = st_context(ctx); + struct gl_renderbuffer *depthRb + = ctx->DrawBuffer->Attachment[BUFFER_DEPTH].Renderbuffer; + struct gl_renderbuffer *stencilRb + = ctx->DrawBuffer->Attachment[BUFFER_STENCIL].Renderbuffer; + GLbitfield quad_buffers = 0x0; + GLbitfield clear_buffers = 0x0; + GLuint i; + + /* This makes sure the pipe has the latest scissor, etc values */ + st_validate_state( st ); + + if (mask & BUFFER_BITS_COLOR) { + for (i = 0; i < ctx->DrawBuffer->_NumColorDrawBuffers; i++) { + GLuint b = ctx->DrawBuffer->_ColorDrawBufferIndexes[i]; + + if (mask & (1 << b)) { + struct gl_renderbuffer *rb + = ctx->DrawBuffer->Attachment[b].Renderbuffer; + struct st_renderbuffer *strb = st_renderbuffer(rb); + + if (!strb || !strb->surface) + continue; + + if (check_clear_color_with_quad( ctx, rb )) + quad_buffers |= PIPE_CLEAR_COLOR; + else + clear_buffers |= PIPE_CLEAR_COLOR; + } + } + } + + if ((mask & BUFFER_BITS_DS) == BUFFER_BITS_DS && depthRb == stencilRb) { + /* clearing combined depth + stencil */ + struct st_renderbuffer *strb = st_renderbuffer(depthRb); + + if (strb->surface) { + if (check_clear_depth_stencil_with_quad(ctx, depthRb)) + quad_buffers |= PIPE_CLEAR_DEPTHSTENCIL; + else + clear_buffers |= PIPE_CLEAR_DEPTHSTENCIL; + } + } + else { + /* separate depth/stencil clears */ + /* I don't think truly separate buffers are actually possible in gallium or hw? 
*/ + if (mask & BUFFER_BIT_DEPTH) { + struct st_renderbuffer *strb = st_renderbuffer(depthRb); + + if (strb->surface) { + if (check_clear_depth_with_quad(ctx, depthRb, + st->clear.enable_ds_separate)) + quad_buffers |= PIPE_CLEAR_DEPTH; + else + clear_buffers |= PIPE_CLEAR_DEPTH; + } + } + if (mask & BUFFER_BIT_STENCIL) { + struct st_renderbuffer *strb = st_renderbuffer(stencilRb); + + if (strb->surface) { + if (check_clear_stencil_with_quad(ctx, stencilRb, + st->clear.enable_ds_separate)) + quad_buffers |= PIPE_CLEAR_STENCIL; + else + clear_buffers |= PIPE_CLEAR_STENCIL; + } + } + } + + /* + * If we're going to use clear_with_quad() for any reason, use it for + * everything possible. + */ + if (quad_buffers) { + quad_buffers |= clear_buffers; + clear_with_quad(ctx, + quad_buffers & PIPE_CLEAR_COLOR, + quad_buffers & PIPE_CLEAR_DEPTH, + quad_buffers & PIPE_CLEAR_STENCIL); + } else if (clear_buffers) { + /* driver cannot know it can clear everything if the buffer + * is a combined depth/stencil buffer but this wasn't actually + * required from the visual. Hence fix this up to avoid potential + * read-modify-write in the driver. + */ + if ((clear_buffers & PIPE_CLEAR_DEPTHSTENCIL) && + ((clear_buffers & PIPE_CLEAR_DEPTHSTENCIL) != PIPE_CLEAR_DEPTHSTENCIL) && + (depthRb == stencilRb) && + (ctx->DrawBuffer->Visual.depthBits == 0 || + ctx->DrawBuffer->Visual.stencilBits == 0)) + clear_buffers |= PIPE_CLEAR_DEPTHSTENCIL; + st->pipe->clear(st->pipe, clear_buffers, ctx->Color.ClearColor, + ctx->Depth.Clear, ctx->Stencil.Clear); + } + if (mask & BUFFER_BIT_ACCUM) + st_clear_accum_buffer(ctx, + ctx->DrawBuffer->Attachment[BUFFER_ACCUM].Renderbuffer); +} + + +void +st_init_clear_functions(struct dd_function_table *functions) +{ + functions->Clear = st_Clear; +} diff --git a/mesalib/src/mesa/state_tracker/st_cb_drawpixels.c b/mesalib/src/mesa/state_tracker/st_cb_drawpixels.c index 56c7e8581..07527002b 100644 --- a/mesalib/src/mesa/state_tracker/st_cb_drawpixels.c +++ b/mesalib/src/mesa/state_tracker/st_cb_drawpixels.c @@ -1,1368 +1,1371 @@ -/************************************************************************** - * - * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. - * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR - * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- * - **************************************************************************/ - - /* - * Authors: - * Brian Paul - */ - -#include "main/imports.h" -#include "main/image.h" -#include "main/bufferobj.h" -#include "main/macros.h" -#include "main/mfeatures.h" -#include "main/mtypes.h" -#include "main/pack.h" -#include "main/texformat.h" -#include "main/texstore.h" -#include "program/program.h" -#include "program/prog_print.h" -#include "program/prog_instruction.h" - -#include "st_atom.h" -#include "st_atom_constbuf.h" -#include "st_cb_drawpixels.h" -#include "st_cb_readpixels.h" -#include "st_cb_fbo.h" -#include "st_context.h" -#include "st_debug.h" -#include "st_format.h" -#include "st_program.h" -#include "st_texture.h" - -#include "pipe/p_context.h" -#include "pipe/p_defines.h" -#include "tgsi/tgsi_ureg.h" -#include "util/u_draw_quad.h" -#include "util/u_format.h" -#include "util/u_inlines.h" -#include "util/u_math.h" -#include "util/u_tile.h" -#include "cso_cache/cso_context.h" - - -#if FEATURE_drawpix - -/** - * Check if the given program is: - * 0: MOVE result.color, fragment.color; - * 1: END; - */ -static GLboolean -is_passthrough_program(const struct gl_fragment_program *prog) -{ - if (prog->Base.NumInstructions == 2) { - const struct prog_instruction *inst = prog->Base.Instructions; - if (inst[0].Opcode == OPCODE_MOV && - inst[1].Opcode == OPCODE_END && - inst[0].DstReg.File == PROGRAM_OUTPUT && - inst[0].DstReg.Index == FRAG_RESULT_COLOR && - inst[0].DstReg.WriteMask == WRITEMASK_XYZW && - inst[0].SrcReg[0].File == PROGRAM_INPUT && - inst[0].SrcReg[0].Index == FRAG_ATTRIB_COL0 && - inst[0].SrcReg[0].Swizzle == SWIZZLE_XYZW) { - return GL_TRUE; - } - } - return GL_FALSE; -} - - - -/** - * Make fragment shader for glDraw/CopyPixels. This shader is made - * by combining the pixel transfer shader with the user-defined shader. - * \param fpIn the current/incoming fragment program - * \param fpOut returns the combined fragment program - */ -void -st_make_drawpix_fragment_program(struct st_context *st, - struct gl_fragment_program *fpIn, - struct gl_fragment_program **fpOut) -{ - struct gl_program *newProg; - - if (is_passthrough_program(fpIn)) { - newProg = (struct gl_program *) _mesa_clone_fragment_program(st->ctx, - &st->pixel_xfer.program->Base); - } - else { -#if 0 - /* debug */ - printf("Base program:\n"); - _mesa_print_program(&fpIn->Base); - printf("DrawPix program:\n"); - _mesa_print_program(&st->pixel_xfer.program->Base.Base); -#endif - newProg = _mesa_combine_programs(st->ctx, - &st->pixel_xfer.program->Base.Base, - &fpIn->Base); - } - -#if 0 - /* debug */ - printf("Combined DrawPixels program:\n"); - _mesa_print_program(newProg); - printf("InputsRead: 0x%x\n", newProg->InputsRead); - printf("OutputsWritten: 0x%x\n", newProg->OutputsWritten); - _mesa_print_parameter_list(newProg->Parameters); -#endif - - *fpOut = (struct gl_fragment_program *) newProg; -} - - -/** - * Create fragment program that does a TEX() instruction to get a Z and/or - * stencil value value, then writes to FRAG_RESULT_DEPTH/FRAG_RESULT_STENCIL. - * Used for glDrawPixels(GL_DEPTH_COMPONENT / GL_STENCIL_INDEX). - * Pass fragment color through as-is. 
- * \return pointer to the gl_fragment program - */ -struct gl_fragment_program * -st_make_drawpix_z_stencil_program(struct st_context *st, - GLboolean write_depth, - GLboolean write_stencil) -{ - struct gl_context *ctx = st->ctx; - struct gl_program *p; - struct gl_fragment_program *fp; - GLuint ic = 0; - const GLuint shaderIndex = write_depth * 2 + write_stencil; - - assert(shaderIndex < Elements(st->drawpix.shaders)); - - if (st->drawpix.shaders[shaderIndex]) { - /* already have the proper shader */ - return st->drawpix.shaders[shaderIndex]; - } - - /* - * Create shader now - */ - p = ctx->Driver.NewProgram(ctx, GL_FRAGMENT_PROGRAM_ARB, 0); - if (!p) - return NULL; - - p->NumInstructions = write_depth ? 2 : 1; - p->NumInstructions += write_stencil ? 1 : 0; - - p->Instructions = _mesa_alloc_instructions(p->NumInstructions); - if (!p->Instructions) { - ctx->Driver.DeleteProgram(ctx, p); - return NULL; - } - _mesa_init_instructions(p->Instructions, p->NumInstructions); - - if (write_depth) { - /* TEX result.depth, fragment.texcoord[0], texture[0], 2D; */ - p->Instructions[ic].Opcode = OPCODE_TEX; - p->Instructions[ic].DstReg.File = PROGRAM_OUTPUT; - p->Instructions[ic].DstReg.Index = FRAG_RESULT_DEPTH; - p->Instructions[ic].DstReg.WriteMask = WRITEMASK_Z; - p->Instructions[ic].SrcReg[0].File = PROGRAM_INPUT; - p->Instructions[ic].SrcReg[0].Index = FRAG_ATTRIB_TEX0; - p->Instructions[ic].TexSrcUnit = 0; - p->Instructions[ic].TexSrcTarget = TEXTURE_2D_INDEX; - ic++; - } - - if (write_stencil) { - /* TEX result.stencil, fragment.texcoord[0], texture[0], 2D; */ - p->Instructions[ic].Opcode = OPCODE_TEX; - p->Instructions[ic].DstReg.File = PROGRAM_OUTPUT; - p->Instructions[ic].DstReg.Index = FRAG_RESULT_STENCIL; - p->Instructions[ic].DstReg.WriteMask = WRITEMASK_Y; - p->Instructions[ic].SrcReg[0].File = PROGRAM_INPUT; - p->Instructions[ic].SrcReg[0].Index = FRAG_ATTRIB_TEX0; - p->Instructions[ic].TexSrcUnit = 1; - p->Instructions[ic].TexSrcTarget = TEXTURE_2D_INDEX; - ic++; - } - - /* END; */ - p->Instructions[ic++].Opcode = OPCODE_END; - - assert(ic == p->NumInstructions); - - p->InputsRead = FRAG_BIT_TEX0 | FRAG_BIT_COL0; - p->OutputsWritten = 0; - if (write_depth) - p->OutputsWritten |= (1 << FRAG_RESULT_DEPTH); - if (write_stencil) - p->OutputsWritten |= (1 << FRAG_RESULT_STENCIL); - - p->SamplersUsed = 0x1; /* sampler 0 (bit 0) is used */ - if (write_stencil) - p->SamplersUsed |= 1 << 1; - - fp = (struct gl_fragment_program *) p; - - /* save the new shader */ - st->drawpix.shaders[shaderIndex] = fp; - - return fp; -} - - -/** - * Create a simple vertex shader that just passes through the - * vertex position and texcoord (and optionally, color). 
- */ -static void * -make_passthrough_vertex_shader(struct st_context *st, - GLboolean passColor) -{ - if (!st->drawpix.vert_shaders[passColor]) { - struct ureg_program *ureg = ureg_create( TGSI_PROCESSOR_VERTEX ); - - if (ureg == NULL) - return NULL; - - /* MOV result.pos, vertex.pos; */ - ureg_MOV(ureg, - ureg_DECL_output( ureg, TGSI_SEMANTIC_POSITION, 0 ), - ureg_DECL_vs_input( ureg, 0 )); - - /* MOV result.texcoord0, vertex.attr[1]; */ - ureg_MOV(ureg, - ureg_DECL_output( ureg, TGSI_SEMANTIC_GENERIC, 0 ), - ureg_DECL_vs_input( ureg, 1 )); - - if (passColor) { - /* MOV result.color0, vertex.attr[2]; */ - ureg_MOV(ureg, - ureg_DECL_output( ureg, TGSI_SEMANTIC_COLOR, 0 ), - ureg_DECL_vs_input( ureg, 2 )); - } - - ureg_END( ureg ); - - st->drawpix.vert_shaders[passColor] = - ureg_create_shader_and_destroy( ureg, st->pipe ); - } - - return st->drawpix.vert_shaders[passColor]; -} - - -/** - * Return a texture base format for drawing/copying an image - * of the given format. - */ -static GLenum -base_format(GLenum format) -{ - switch (format) { - case GL_DEPTH_COMPONENT: - return GL_DEPTH_COMPONENT; - case GL_DEPTH_STENCIL: - return GL_DEPTH_STENCIL; - case GL_STENCIL_INDEX: - return GL_STENCIL_INDEX; - default: - return GL_RGBA; - } -} - - -/** - * Return a texture internalFormat for drawing/copying an image - * of the given format and type. - */ -static GLenum -internal_format(GLenum format, GLenum type) -{ - switch (format) { - case GL_DEPTH_COMPONENT: - return GL_DEPTH_COMPONENT; - case GL_DEPTH_STENCIL: - return GL_DEPTH_STENCIL; - case GL_STENCIL_INDEX: - return GL_STENCIL_INDEX; - default: - if (_mesa_is_integer_format(format)) { - switch (type) { - case GL_BYTE: - return GL_RGBA8I; - case GL_UNSIGNED_BYTE: - return GL_RGBA8UI; - case GL_SHORT: - return GL_RGBA16I; - case GL_UNSIGNED_SHORT: - return GL_RGBA16UI; - case GL_INT: - return GL_RGBA32I; - case GL_UNSIGNED_INT: - return GL_RGBA32UI; - default: - assert(0 && "Unexpected type in internal_format()"); - return GL_RGBA_INTEGER; - } - } - else { - return GL_RGBA; - } - } -} - - -/** - * Create a temporary texture to hold an image of the given size. - * If width, height are not POT and the driver only handles POT textures, - * allocate the next larger size of texture that is POT. - */ -static struct pipe_resource * -alloc_texture(struct st_context *st, GLsizei width, GLsizei height, - enum pipe_format texFormat) -{ - struct pipe_resource *pt; - - pt = st_texture_create(st, st->internal_target, texFormat, 0, - width, height, 1, 1, PIPE_BIND_SAMPLER_VIEW); - - return pt; -} - - -/** - * Make texture containing an image for glDrawPixels image. - * If 'pixels' is NULL, leave the texture image data undefined. 
- */ -static struct pipe_resource * -make_texture(struct st_context *st, - GLsizei width, GLsizei height, GLenum format, GLenum type, - const struct gl_pixelstore_attrib *unpack, - const GLvoid *pixels) -{ - struct gl_context *ctx = st->ctx; - struct pipe_context *pipe = st->pipe; - gl_format mformat; - struct pipe_resource *pt; - enum pipe_format pipeFormat; - GLuint cpp; - GLenum baseFormat, intFormat; - - baseFormat = base_format(format); - intFormat = internal_format(format, type); - - mformat = st_ChooseTextureFormat_renderable(ctx, intFormat, - format, type, GL_FALSE); - assert(mformat); - - pipeFormat = st_mesa_format_to_pipe_format(mformat); - assert(pipeFormat); - cpp = util_format_get_blocksize(pipeFormat); - - pixels = _mesa_map_pbo_source(ctx, unpack, pixels); - if (!pixels) - return NULL; - - /* alloc temporary texture */ - pt = alloc_texture(st, width, height, pipeFormat); - if (!pt) { - _mesa_unmap_pbo_source(ctx, unpack); - return NULL; - } - - { - struct pipe_transfer *transfer; - static const GLuint dstImageOffsets = 0; - GLboolean success; - GLubyte *dest; - const GLbitfield imageTransferStateSave = ctx->_ImageTransferState; - - /* we'll do pixel transfer in a fragment shader */ - ctx->_ImageTransferState = 0x0; - - transfer = pipe_get_transfer(st->pipe, pt, 0, 0, - PIPE_TRANSFER_WRITE, 0, 0, - width, height); - - /* map texture transfer */ - dest = pipe_transfer_map(pipe, transfer); - - - /* Put image into texture transfer. - * Note that the image is actually going to be upside down in - * the texture. We deal with that with texcoords. - */ - success = _mesa_texstore(ctx, 2, /* dims */ - baseFormat, /* baseInternalFormat */ - mformat, /* gl_format */ - dest, /* dest */ - 0, 0, 0, /* dstX/Y/Zoffset */ - transfer->stride, /* dstRowStride, bytes */ - &dstImageOffsets, /* dstImageOffsets */ - width, height, 1, /* size */ - format, type, /* src format/type */ - pixels, /* data source */ - unpack); - - /* unmap */ - pipe_transfer_unmap(pipe, transfer); - pipe->transfer_destroy(pipe, transfer); - - assert(success); - - /* restore */ - ctx->_ImageTransferState = imageTransferStateSave; - } - - _mesa_unmap_pbo_source(ctx, unpack); - - return pt; -} - - -/** - * Draw quad with texcoords and optional color. - * Coords are gallium window coords with y=0=top. - * \param color may be null - * \param invertTex if true, flip texcoords vertically - */ -static void -draw_quad(struct gl_context *ctx, GLfloat x0, GLfloat y0, GLfloat z, - GLfloat x1, GLfloat y1, const GLfloat *color, - GLboolean invertTex, GLfloat maxXcoord, GLfloat maxYcoord) -{ - struct st_context *st = st_context(ctx); - struct pipe_context *pipe = st->pipe; - GLfloat verts[4][3][4]; /* four verts, three attribs, XYZW */ - - /* setup vertex data */ - { - const struct gl_framebuffer *fb = st->ctx->DrawBuffer; - const GLfloat fb_width = (GLfloat) fb->Width; - const GLfloat fb_height = (GLfloat) fb->Height; - const GLfloat clip_x0 = x0 / fb_width * 2.0f - 1.0f; - const GLfloat clip_y0 = y0 / fb_height * 2.0f - 1.0f; - const GLfloat clip_x1 = x1 / fb_width * 2.0f - 1.0f; - const GLfloat clip_y1 = y1 / fb_height * 2.0f - 1.0f; - const GLfloat sLeft = 0.0f, sRight = maxXcoord; - const GLfloat tTop = invertTex ? maxYcoord : 0.0f; - const GLfloat tBot = invertTex ? 
0.0f : maxYcoord; - GLuint i; - - /* upper-left */ - verts[0][0][0] = clip_x0; /* v[0].attr[0].x */ - verts[0][0][1] = clip_y0; /* v[0].attr[0].y */ - - /* upper-right */ - verts[1][0][0] = clip_x1; - verts[1][0][1] = clip_y0; - - /* lower-right */ - verts[2][0][0] = clip_x1; - verts[2][0][1] = clip_y1; - - /* lower-left */ - verts[3][0][0] = clip_x0; - verts[3][0][1] = clip_y1; - - verts[0][1][0] = sLeft; /* v[0].attr[1].S */ - verts[0][1][1] = tTop; /* v[0].attr[1].T */ - verts[1][1][0] = sRight; - verts[1][1][1] = tTop; - verts[2][1][0] = sRight; - verts[2][1][1] = tBot; - verts[3][1][0] = sLeft; - verts[3][1][1] = tBot; - - /* same for all verts: */ - if (color) { - for (i = 0; i < 4; i++) { - verts[i][0][2] = z; /* v[i].attr[0].z */ - verts[i][0][3] = 1.0f; /* v[i].attr[0].w */ - verts[i][2][0] = color[0]; /* v[i].attr[2].r */ - verts[i][2][1] = color[1]; /* v[i].attr[2].g */ - verts[i][2][2] = color[2]; /* v[i].attr[2].b */ - verts[i][2][3] = color[3]; /* v[i].attr[2].a */ - verts[i][1][2] = 0.0f; /* v[i].attr[1].R */ - verts[i][1][3] = 1.0f; /* v[i].attr[1].Q */ - } - } - else { - for (i = 0; i < 4; i++) { - verts[i][0][2] = z; /*Z*/ - verts[i][0][3] = 1.0f; /*W*/ - verts[i][1][2] = 0.0f; /*R*/ - verts[i][1][3] = 1.0f; /*Q*/ - } - } - } - - { - struct pipe_resource *buf; - - /* allocate/load buffer object with vertex data */ - buf = pipe_buffer_create(pipe->screen, - PIPE_BIND_VERTEX_BUFFER, - sizeof(verts)); - pipe_buffer_write(st->pipe, buf, 0, sizeof(verts), verts); - - util_draw_vertex_buffer(pipe, buf, 0, - PIPE_PRIM_QUADS, - 4, /* verts */ - 3); /* attribs/vert */ - pipe_resource_reference(&buf, NULL); - } -} - - - -static void -draw_textured_quad(struct gl_context *ctx, GLint x, GLint y, GLfloat z, - GLsizei width, GLsizei height, - GLfloat zoomX, GLfloat zoomY, - struct pipe_sampler_view **sv, - int num_sampler_view, - void *driver_vp, - void *driver_fp, - const GLfloat *color, - GLboolean invertTex, - GLboolean write_depth, GLboolean write_stencil) -{ - struct st_context *st = st_context(ctx); - struct pipe_context *pipe = st->pipe; - struct cso_context *cso = st->cso_context; - GLfloat x0, y0, x1, y1; - GLsizei maxSize; - boolean normalized = sv[0]->texture->target != PIPE_TEXTURE_RECT; - - /* limit checks */ - /* XXX if DrawPixels image is larger than max texture size, break - * it up into chunks. - */ - maxSize = 1 << (pipe->screen->get_param(pipe->screen, - PIPE_CAP_MAX_TEXTURE_2D_LEVELS) - 1); - assert(width <= maxSize); - assert(height <= maxSize); - - cso_save_rasterizer(cso); - cso_save_viewport(cso); - cso_save_samplers(cso); - cso_save_fragment_sampler_views(cso); - cso_save_fragment_shader(cso); - cso_save_vertex_shader(cso); - cso_save_vertex_elements(cso); - if (write_stencil) { - cso_save_depth_stencil_alpha(cso); - cso_save_blend(cso); - } - - /* rasterizer state: just scissor */ - { - struct pipe_rasterizer_state rasterizer; - memset(&rasterizer, 0, sizeof(rasterizer)); - rasterizer.gl_rasterization_rules = 1; - rasterizer.scissor = ctx->Scissor.Enabled; - cso_set_rasterizer(cso, &rasterizer); - } - - if (write_stencil) { - /* Stencil writing bypasses the normal fragment pipeline to - * disable color writing and set stencil test to always pass. 
- */ - struct pipe_depth_stencil_alpha_state dsa; - struct pipe_blend_state blend; - - /* depth/stencil */ - memset(&dsa, 0, sizeof(dsa)); - dsa.stencil[0].enabled = 1; - dsa.stencil[0].func = PIPE_FUNC_ALWAYS; - dsa.stencil[0].writemask = ctx->Stencil.WriteMask[0] & 0xff; - dsa.stencil[0].zpass_op = PIPE_STENCIL_OP_REPLACE; - if (write_depth) { - /* writing depth+stencil: depth test always passes */ - dsa.depth.enabled = 1; - dsa.depth.writemask = ctx->Depth.Mask; - dsa.depth.func = PIPE_FUNC_ALWAYS; - } - cso_set_depth_stencil_alpha(cso, &dsa); - - /* blend (colormask) */ - memset(&blend, 0, sizeof(blend)); - cso_set_blend(cso, &blend); - } - - /* fragment shader state: TEX lookup program */ - cso_set_fragment_shader_handle(cso, driver_fp); - - /* vertex shader state: position + texcoord pass-through */ - cso_set_vertex_shader_handle(cso, driver_vp); - - - /* texture sampling state: */ - { - struct pipe_sampler_state sampler; - memset(&sampler, 0, sizeof(sampler)); - sampler.wrap_s = PIPE_TEX_WRAP_CLAMP; - sampler.wrap_t = PIPE_TEX_WRAP_CLAMP; - sampler.wrap_r = PIPE_TEX_WRAP_CLAMP; - sampler.min_img_filter = PIPE_TEX_FILTER_NEAREST; - sampler.min_mip_filter = PIPE_TEX_MIPFILTER_NONE; - sampler.mag_img_filter = PIPE_TEX_FILTER_NEAREST; - sampler.normalized_coords = normalized; - - cso_single_sampler(cso, 0, &sampler); - if (num_sampler_view > 1) { - cso_single_sampler(cso, 1, &sampler); - } - cso_single_sampler_done(cso); - } - - /* viewport state: viewport matching window dims */ - { - const float w = (float) ctx->DrawBuffer->Width; - const float h = (float) ctx->DrawBuffer->Height; - struct pipe_viewport_state vp; - vp.scale[0] = 0.5f * w; - vp.scale[1] = -0.5f * h; - vp.scale[2] = 0.5f; - vp.scale[3] = 1.0f; - vp.translate[0] = 0.5f * w; - vp.translate[1] = 0.5f * h; - vp.translate[2] = 0.5f; - vp.translate[3] = 0.0f; - cso_set_viewport(cso, &vp); - } - - cso_set_vertex_elements(cso, 3, st->velems_util_draw); - - /* texture state: */ - cso_set_fragment_sampler_views(cso, num_sampler_view, sv); - - /* Compute Gallium window coords (y=0=top) with pixel zoom. - * Recall that these coords are transformed by the current - * vertex shader and viewport transformation. - */ - if (st_fb_orientation(ctx->DrawBuffer) == Y_0_BOTTOM) { - y = ctx->DrawBuffer->Height - (int) (y + height * ctx->Pixel.ZoomY); - invertTex = !invertTex; - } - - x0 = (GLfloat) x; - x1 = x + width * ctx->Pixel.ZoomX; - y0 = (GLfloat) y; - y1 = y + height * ctx->Pixel.ZoomY; - - /* convert Z from [0,1] to [-1,-1] to match viewport Z scale/bias */ - z = z * 2.0 - 1.0; - - draw_quad(ctx, x0, y0, z, x1, y1, color, invertTex, - normalized ? ((GLfloat) width / sv[0]->texture->width0) : (GLfloat)width, - normalized ? ((GLfloat) height / sv[0]->texture->height0) : (GLfloat)height); - - /* restore state */ - cso_restore_rasterizer(cso); - cso_restore_viewport(cso); - cso_restore_samplers(cso); - cso_restore_fragment_sampler_views(cso); - cso_restore_fragment_shader(cso); - cso_restore_vertex_shader(cso); - cso_restore_vertex_elements(cso); - if (write_stencil) { - cso_restore_depth_stencil_alpha(cso); - cso_restore_blend(cso); - } -} - - -/** - * Software fallback to do glDrawPixels(GL_STENCIL_INDEX) when we - * can't use a fragment shader to write stencil values. 
- */ -static void -draw_stencil_pixels(struct gl_context *ctx, GLint x, GLint y, - GLsizei width, GLsizei height, GLenum format, GLenum type, - const struct gl_pixelstore_attrib *unpack, - const GLvoid *pixels) -{ - struct st_context *st = st_context(ctx); - struct pipe_context *pipe = st->pipe; - struct st_renderbuffer *strb; - enum pipe_transfer_usage usage; - struct pipe_transfer *pt; - const GLboolean zoom = ctx->Pixel.ZoomX != 1.0 || ctx->Pixel.ZoomY != 1.0; - GLint skipPixels; - ubyte *stmap; - struct gl_pixelstore_attrib clippedUnpack = *unpack; - - if (!zoom) { - if (!_mesa_clip_drawpixels(ctx, &x, &y, &width, &height, - &clippedUnpack)) { - /* totally clipped */ - return; - } - } - - strb = st_renderbuffer(ctx->DrawBuffer-> - Attachment[BUFFER_STENCIL].Renderbuffer); - - if (st_fb_orientation(ctx->DrawBuffer) == Y_0_TOP) { - y = ctx->DrawBuffer->Height - y - height; - } - - if(format != GL_DEPTH_STENCIL && - util_format_get_component_bits(strb->format, - UTIL_FORMAT_COLORSPACE_ZS, 0) != 0) - usage = PIPE_TRANSFER_READ_WRITE; - else - usage = PIPE_TRANSFER_WRITE; - - pt = pipe_get_transfer(st_context(ctx)->pipe, strb->texture, 0, 0, - usage, x, y, - width, height); - - stmap = pipe_transfer_map(pipe, pt); - - pixels = _mesa_map_pbo_source(ctx, &clippedUnpack, pixels); - assert(pixels); - - /* if width > MAX_WIDTH, have to process image in chunks */ - skipPixels = 0; - while (skipPixels < width) { - const GLint spanX = skipPixels; - const GLint spanWidth = MIN2(width - skipPixels, MAX_WIDTH); - GLint row; - for (row = 0; row < height; row++) { - GLubyte sValues[MAX_WIDTH]; - GLuint zValues[MAX_WIDTH]; - GLenum destType = GL_UNSIGNED_BYTE; - const GLvoid *source = _mesa_image_address2d(&clippedUnpack, pixels, - width, height, - format, type, - row, skipPixels); - _mesa_unpack_stencil_span(ctx, spanWidth, destType, sValues, - type, source, &clippedUnpack, - ctx->_ImageTransferState); - - if (format == GL_DEPTH_STENCIL) { - _mesa_unpack_depth_span(ctx, spanWidth, GL_UNSIGNED_INT, zValues, - (1 << 24) - 1, type, source, - &clippedUnpack); - } - - if (zoom) { - _mesa_problem(ctx, "Gallium glDrawPixels(GL_STENCIL) with " - "zoom not complete"); - } - - { - GLint spanY; - - if (st_fb_orientation(ctx->DrawBuffer) == Y_0_TOP) { - spanY = height - row - 1; - } - else { - spanY = row; - } - - /* now pack the stencil (and Z) values in the dest format */ - switch (pt->resource->format) { - case PIPE_FORMAT_S8_USCALED: - { - ubyte *dest = stmap + spanY * pt->stride + spanX; - assert(usage == PIPE_TRANSFER_WRITE); - memcpy(dest, sValues, spanWidth); - } - break; - case PIPE_FORMAT_Z24_UNORM_S8_USCALED: - if (format == GL_DEPTH_STENCIL) { - uint *dest = (uint *) (stmap + spanY * pt->stride + spanX*4); - GLint k; - assert(usage == PIPE_TRANSFER_WRITE); - for (k = 0; k < spanWidth; k++) { - dest[k] = zValues[k] | (sValues[k] << 24); - } - } - else { - uint *dest = (uint *) (stmap + spanY * pt->stride + spanX*4); - GLint k; - assert(usage == PIPE_TRANSFER_READ_WRITE); - for (k = 0; k < spanWidth; k++) { - dest[k] = (dest[k] & 0xffffff) | (sValues[k] << 24); - } - } - break; - case PIPE_FORMAT_S8_USCALED_Z24_UNORM: - if (format == GL_DEPTH_STENCIL) { - uint *dest = (uint *) (stmap + spanY * pt->stride + spanX*4); - GLint k; - assert(usage == PIPE_TRANSFER_WRITE); - for (k = 0; k < spanWidth; k++) { - dest[k] = (zValues[k] << 8) | (sValues[k] & 0xff); - } - } - else { - uint *dest = (uint *) (stmap + spanY * pt->stride + spanX*4); - GLint k; - assert(usage == PIPE_TRANSFER_READ_WRITE); - for (k = 0; k < 
spanWidth; k++) { - dest[k] = (dest[k] & 0xffffff00) | (sValues[k] & 0xff); - } - } - break; - default: - assert(0); - } - } - } - skipPixels += spanWidth; - } - - _mesa_unmap_pbo_source(ctx, &clippedUnpack); - - /* unmap the stencil buffer */ - pipe_transfer_unmap(pipe, pt); - pipe->transfer_destroy(pipe, pt); -} - - -/** - * Get fragment program variant for a glDrawPixels or glCopyPixels - * command for RGBA data. - */ -static struct st_fp_variant * -get_color_fp_variant(struct st_context *st) -{ - struct gl_context *ctx = st->ctx; - struct st_fp_variant_key key; - struct st_fp_variant *fpv; - - memset(&key, 0, sizeof(key)); - - key.st = st; - key.drawpixels = 1; - key.scaleAndBias = (ctx->Pixel.RedBias != 0.0 || - ctx->Pixel.RedScale != 1.0 || - ctx->Pixel.GreenBias != 0.0 || - ctx->Pixel.GreenScale != 1.0 || - ctx->Pixel.BlueBias != 0.0 || - ctx->Pixel.BlueScale != 1.0 || - ctx->Pixel.AlphaBias != 0.0 || - ctx->Pixel.AlphaScale != 1.0); - key.pixelMaps = ctx->Pixel.MapColorFlag; - - fpv = st_get_fp_variant(st, st->fp, &key); - - return fpv; -} - - -/** - * Get fragment program variant for a glDrawPixels or glCopyPixels - * command for depth/stencil data. - */ -static struct st_fp_variant * -get_depth_stencil_fp_variant(struct st_context *st, GLboolean write_depth, - GLboolean write_stencil) -{ - struct st_fp_variant_key key; - struct st_fp_variant *fpv; - - memset(&key, 0, sizeof(key)); - - key.st = st; - key.drawpixels = 1; - key.drawpixels_z = write_depth; - key.drawpixels_stencil = write_stencil; - - fpv = st_get_fp_variant(st, st->fp, &key); - - return fpv; -} - - -/** - * Called via ctx->Driver.DrawPixels() - */ -static void -st_DrawPixels(struct gl_context *ctx, GLint x, GLint y, - GLsizei width, GLsizei height, - GLenum format, GLenum type, - const struct gl_pixelstore_attrib *unpack, const GLvoid *pixels) -{ - void *driver_vp, *driver_fp; - struct st_context *st = st_context(ctx); - const GLfloat *color; - struct pipe_context *pipe = st->pipe; - GLboolean write_stencil = GL_FALSE, write_depth = GL_FALSE; - struct pipe_sampler_view *sv[2]; - int num_sampler_view = 1; - enum pipe_format stencil_format = PIPE_FORMAT_NONE; - struct st_fp_variant *fpv; - - if (format == GL_DEPTH_STENCIL) - write_stencil = write_depth = GL_TRUE; - else if (format == GL_STENCIL_INDEX) - write_stencil = GL_TRUE; - else if (format == GL_DEPTH_COMPONENT) - write_depth = GL_TRUE; - - if (write_stencil) { - enum pipe_format tex_format; - /* can we write to stencil if not fallback */ - if (!pipe->screen->get_param(pipe->screen, PIPE_CAP_SHADER_STENCIL_EXPORT)) - goto stencil_fallback; - - tex_format = st_choose_format(st->pipe->screen, base_format(format), - PIPE_TEXTURE_2D, - 0, PIPE_BIND_SAMPLER_VIEW); - if (tex_format == PIPE_FORMAT_Z24_UNORM_S8_USCALED) - stencil_format = PIPE_FORMAT_X24S8_USCALED; - else if (tex_format == PIPE_FORMAT_S8_USCALED_Z24_UNORM) - stencil_format = PIPE_FORMAT_S8X24_USCALED; - else - stencil_format = PIPE_FORMAT_S8_USCALED; - if (stencil_format == PIPE_FORMAT_NONE) - goto stencil_fallback; - } - - /* Mesa state should be up to date by now */ - assert(ctx->NewState == 0x0); - - st_validate_state(st); - - /* - * Get vertex/fragment shaders - */ - if (write_depth || write_stencil) { - fpv = get_depth_stencil_fp_variant(st, write_depth, write_stencil); - - driver_fp = fpv->driver_shader; - - driver_vp = make_passthrough_vertex_shader(st, GL_TRUE); - - color = ctx->Current.RasterColor; - } - else { - fpv = get_color_fp_variant(st); - - driver_fp = fpv->driver_shader; - - driver_vp 
= make_passthrough_vertex_shader(st, GL_FALSE); - - color = NULL; - if (st->pixel_xfer.pixelmap_enabled) { - sv[1] = st->pixel_xfer.pixelmap_sampler_view; - num_sampler_view++; - } - } - - /* update fragment program constants */ - st_upload_constants(st, fpv->parameters, PIPE_SHADER_FRAGMENT); - - /* draw with textured quad */ - { - struct pipe_resource *pt - = make_texture(st, width, height, format, type, unpack, pixels); - if (pt) { - sv[0] = st_create_texture_sampler_view(st->pipe, pt); - - if (sv[0]) { - if (write_stencil) { - sv[1] = st_create_texture_sampler_view_format(st->pipe, pt, - stencil_format); - num_sampler_view++; - } - - draw_textured_quad(ctx, x, y, ctx->Current.RasterPos[2], - width, height, - ctx->Pixel.ZoomX, ctx->Pixel.ZoomY, - sv, - num_sampler_view, - driver_vp, - driver_fp, - color, GL_FALSE, write_depth, write_stencil); - pipe_sampler_view_reference(&sv[0], NULL); - if (num_sampler_view > 1) - pipe_sampler_view_reference(&sv[1], NULL); - } - pipe_resource_reference(&pt, NULL); - } - } - return; - -stencil_fallback: - draw_stencil_pixels(ctx, x, y, width, height, format, type, - unpack, pixels); -} - - - -/** - * Software fallback for glCopyPixels(GL_STENCIL). - */ -static void -copy_stencil_pixels(struct gl_context *ctx, GLint srcx, GLint srcy, - GLsizei width, GLsizei height, - GLint dstx, GLint dsty) -{ - struct st_renderbuffer *rbDraw; - struct pipe_context *pipe = st_context(ctx)->pipe; - enum pipe_transfer_usage usage; - struct pipe_transfer *ptDraw; - ubyte *drawMap; - ubyte *buffer; - int i; - - buffer = malloc(width * height * sizeof(ubyte)); - if (!buffer) { - _mesa_error(ctx, GL_OUT_OF_MEMORY, "glCopyPixels(stencil)"); - return; - } - - /* Get the dest renderbuffer. If there's a wrapper, use the - * underlying renderbuffer. 
- */ - rbDraw = st_renderbuffer(ctx->DrawBuffer->_StencilBuffer); - if (rbDraw->Base.Wrapped) - rbDraw = st_renderbuffer(rbDraw->Base.Wrapped); - - /* this will do stencil pixel transfer ops */ - st_read_stencil_pixels(ctx, srcx, srcy, width, height, - GL_STENCIL_INDEX, GL_UNSIGNED_BYTE, - &ctx->DefaultPacking, buffer); - - if (0) { - /* debug code: dump stencil values */ - GLint row, col; - for (row = 0; row < height; row++) { - printf("%3d: ", row); - for (col = 0; col < width; col++) { - printf("%02x ", buffer[col + row * width]); - } - printf("\n"); - } - } - - if (util_format_get_component_bits(rbDraw->format, - UTIL_FORMAT_COLORSPACE_ZS, 0) != 0) - usage = PIPE_TRANSFER_READ_WRITE; - else - usage = PIPE_TRANSFER_WRITE; - - if (st_fb_orientation(ctx->DrawBuffer) == Y_0_TOP) { - dsty = rbDraw->Base.Height - dsty - height; - } - - ptDraw = pipe_get_transfer(st_context(ctx)->pipe, - rbDraw->texture, 0, 0, - usage, dstx, dsty, - width, height); - - assert(util_format_get_blockwidth(ptDraw->resource->format) == 1); - assert(util_format_get_blockheight(ptDraw->resource->format) == 1); - - /* map the stencil buffer */ - drawMap = pipe_transfer_map(pipe, ptDraw); - - /* draw */ - /* XXX PixelZoom not handled yet */ - for (i = 0; i < height; i++) { - ubyte *dst; - const ubyte *src; - int y; - - y = i; - - if (st_fb_orientation(ctx->DrawBuffer) == Y_0_TOP) { - y = height - y - 1; - } - - dst = drawMap + y * ptDraw->stride; - src = buffer + i * width; - - switch (ptDraw->resource->format) { - case PIPE_FORMAT_Z24_UNORM_S8_USCALED: - { - uint *dst4 = (uint *) dst; - int j; - assert(usage == PIPE_TRANSFER_READ_WRITE); - for (j = 0; j < width; j++) { - *dst4 = (*dst4 & 0xffffff) | (src[j] << 24); - dst4++; - } - } - break; - case PIPE_FORMAT_S8_USCALED_Z24_UNORM: - { - uint *dst4 = (uint *) dst; - int j; - assert(usage == PIPE_TRANSFER_READ_WRITE); - for (j = 0; j < width; j++) { - *dst4 = (*dst4 & 0xffffff00) | (src[j] & 0xff); - dst4++; - } - } - break; - case PIPE_FORMAT_S8_USCALED: - assert(usage == PIPE_TRANSFER_WRITE); - memcpy(dst, src, width); - break; - default: - assert(0); - } - } - - free(buffer); - - /* unmap the stencil buffer */ - pipe_transfer_unmap(pipe, ptDraw); - pipe->transfer_destroy(pipe, ptDraw); -} - - -static void -st_CopyPixels(struct gl_context *ctx, GLint srcx, GLint srcy, - GLsizei width, GLsizei height, - GLint dstx, GLint dsty, GLenum type) -{ - struct st_context *st = st_context(ctx); - struct pipe_context *pipe = st->pipe; - struct pipe_screen *screen = pipe->screen; - struct st_renderbuffer *rbRead; - void *driver_vp, *driver_fp; - struct pipe_resource *pt; - struct pipe_sampler_view *sv[2]; - int num_sampler_view = 1; - GLfloat *color; - enum pipe_format srcFormat, texFormat; - GLboolean invertTex = GL_FALSE; - GLint readX, readY, readW, readH; - GLuint sample_count; - struct gl_pixelstore_attrib pack = ctx->DefaultPacking; - struct st_fp_variant *fpv; - - st_validate_state(st); - - if (type == GL_STENCIL) { - /* can't use texturing to do stencil */ - copy_stencil_pixels(ctx, srcx, srcy, width, height, dstx, dsty); - return; - } - - /* - * Get vertex/fragment shaders - */ - if (type == GL_COLOR) { - rbRead = st_get_color_read_renderbuffer(ctx); - color = NULL; - - fpv = get_color_fp_variant(st); - driver_fp = fpv->driver_shader; - - driver_vp = make_passthrough_vertex_shader(st, GL_FALSE); - - if (st->pixel_xfer.pixelmap_enabled) { - sv[1] = st->pixel_xfer.pixelmap_sampler_view; - num_sampler_view++; - } - } - else { - assert(type == GL_DEPTH); - rbRead = 
st_renderbuffer(ctx->ReadBuffer->_DepthBuffer); - color = ctx->Current.Attrib[VERT_ATTRIB_COLOR0]; - - fpv = get_depth_stencil_fp_variant(st, GL_TRUE, GL_FALSE); - driver_fp = fpv->driver_shader; - - driver_vp = make_passthrough_vertex_shader(st, GL_TRUE); - } - - /* update fragment program constants */ - st_upload_constants(st, fpv->parameters, PIPE_SHADER_FRAGMENT); - - - if (rbRead->Base.Wrapped) - rbRead = st_renderbuffer(rbRead->Base.Wrapped); - - sample_count = rbRead->texture->nr_samples; - /* I believe this would be legal, presumably would need to do a resolve - for color, and for depth/stencil spec says to just use one of the - depth/stencil samples per pixel? Need some transfer clarifications. */ - assert(sample_count < 2); - - srcFormat = rbRead->texture->format; - - if (screen->is_format_supported(screen, srcFormat, st->internal_target, - sample_count, - PIPE_BIND_SAMPLER_VIEW, 0)) { - texFormat = srcFormat; - } - else { - /* srcFormat can't be used as a texture format */ - if (type == GL_DEPTH) { - texFormat = st_choose_format(screen, GL_DEPTH_COMPONENT, - st->internal_target, sample_count, - PIPE_BIND_DEPTH_STENCIL); - assert(texFormat != PIPE_FORMAT_NONE); - } - else { - /* default color format */ - texFormat = st_choose_format(screen, GL_RGBA, st->internal_target, - sample_count, PIPE_BIND_SAMPLER_VIEW); - assert(texFormat != PIPE_FORMAT_NONE); - } - } - - /* Invert src region if needed */ - if (st_fb_orientation(ctx->ReadBuffer) == Y_0_TOP) { - srcy = ctx->ReadBuffer->Height - srcy - height; - invertTex = !invertTex; - } - - /* Clip the read region against the src buffer bounds. - * We'll still allocate a temporary buffer/texture for the original - * src region size but we'll only read the region which is on-screen. - * This may mean that we draw garbage pixels into the dest region, but - * that's expected. - */ - readX = srcx; - readY = srcy; - readW = width; - readH = height; - _mesa_clip_readpixels(ctx, &readX, &readY, &readW, &readH, &pack); - readW = MAX2(0, readW); - readH = MAX2(0, readH); - - /* alloc temporary texture */ - pt = alloc_texture(st, width, height, texFormat); - if (!pt) - return; - - sv[0] = st_create_texture_sampler_view(st->pipe, pt); - if (!sv[0]) { - pipe_resource_reference(&pt, NULL); - return; - } - - /* Make temporary texture which is a copy of the src region. 
- */ - if (srcFormat == texFormat) { - struct pipe_box src_box; - u_box_2d(readX, readY, readW, readH, &src_box); - /* copy source framebuffer surface into mipmap/texture */ - pipe->resource_copy_region(pipe, - pt, /* dest tex */ - 0, - pack.SkipPixels, pack.SkipRows, 0, /* dest pos */ - rbRead->texture, /* src tex */ - 0, - &src_box); - - } - else { - /* CPU-based fallback/conversion */ - struct pipe_transfer *ptRead = - pipe_get_transfer(st->pipe, rbRead->texture, 0, 0, - PIPE_TRANSFER_READ, - readX, readY, readW, readH); - struct pipe_transfer *ptTex; - enum pipe_transfer_usage transfer_usage; - - if (ST_DEBUG & DEBUG_FALLBACK) - debug_printf("%s: fallback processing\n", __FUNCTION__); - - if (type == GL_DEPTH && util_format_is_depth_and_stencil(pt->format)) - transfer_usage = PIPE_TRANSFER_READ_WRITE; - else - transfer_usage = PIPE_TRANSFER_WRITE; - - ptTex = pipe_get_transfer(st->pipe, pt, 0, 0, transfer_usage, - 0, 0, width, height); - - /* copy image from ptRead surface to ptTex surface */ - if (type == GL_COLOR) { - /* alternate path using get/put_tile() */ - GLfloat *buf = (GLfloat *) malloc(width * height * 4 * sizeof(GLfloat)); - enum pipe_format readFormat, drawFormat; - readFormat = util_format_linear(rbRead->texture->format); - drawFormat = util_format_linear(pt->format); - pipe_get_tile_rgba_format(pipe, ptRead, readX, readY, readW, readH, - readFormat, buf); - pipe_put_tile_rgba_format(pipe, ptTex, pack.SkipPixels, pack.SkipRows, - readW, readH, drawFormat, buf); - free(buf); - } - else { - /* GL_DEPTH */ - GLuint *buf = (GLuint *) malloc(width * height * sizeof(GLuint)); - pipe_get_tile_z(pipe, ptRead, readX, readY, readW, readH, buf); - pipe_put_tile_z(pipe, ptTex, pack.SkipPixels, pack.SkipRows, - readW, readH, buf); - free(buf); - } - - pipe->transfer_destroy(pipe, ptRead); - pipe->transfer_destroy(pipe, ptTex); - } - - /* OK, the texture 'pt' contains the src image/pixels. Now draw a - * textured quad with that texture. - */ - draw_textured_quad(ctx, dstx, dsty, ctx->Current.RasterPos[2], - width, height, ctx->Pixel.ZoomX, ctx->Pixel.ZoomY, - sv, - num_sampler_view, - driver_vp, - driver_fp, - color, invertTex, GL_FALSE, GL_FALSE); - - pipe_resource_reference(&pt, NULL); - pipe_sampler_view_reference(&sv[0], NULL); -} - - - -void st_init_drawpixels_functions(struct dd_function_table *functions) -{ - functions->DrawPixels = st_DrawPixels; - functions->CopyPixels = st_CopyPixels; -} - - -void -st_destroy_drawpix(struct st_context *st) -{ - GLuint i; - - for (i = 0; i < Elements(st->drawpix.shaders); i++) { - if (st->drawpix.shaders[i]) - _mesa_reference_fragprog(st->ctx, &st->drawpix.shaders[i], NULL); - } - - st_reference_fragprog(st, &st->pixel_xfer.combined_prog, NULL); - if (st->drawpix.vert_shaders[0]) - ureg_free_tokens(st->drawpix.vert_shaders[0]); - if (st->drawpix.vert_shaders[1]) - ureg_free_tokens(st->drawpix.vert_shaders[1]); -} - -#endif /* FEATURE_drawpix */ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + /* + * Authors: + * Brian Paul + */ + +#include "main/imports.h" +#include "main/image.h" +#include "main/bufferobj.h" +#include "main/macros.h" +#include "main/mfeatures.h" +#include "main/mtypes.h" +#include "main/pack.h" +#include "main/texformat.h" +#include "main/texstore.h" +#include "program/program.h" +#include "program/prog_print.h" +#include "program/prog_instruction.h" + +#include "st_atom.h" +#include "st_atom_constbuf.h" +#include "st_cb_drawpixels.h" +#include "st_cb_readpixels.h" +#include "st_cb_fbo.h" +#include "st_context.h" +#include "st_debug.h" +#include "st_format.h" +#include "st_program.h" +#include "st_texture.h" + +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "tgsi/tgsi_ureg.h" +#include "util/u_draw_quad.h" +#include "util/u_format.h" +#include "util/u_inlines.h" +#include "util/u_math.h" +#include "util/u_tile.h" +#include "cso_cache/cso_context.h" + + +#if FEATURE_drawpix + +/** + * Check if the given program is: + * 0: MOVE result.color, fragment.color; + * 1: END; + */ +static GLboolean +is_passthrough_program(const struct gl_fragment_program *prog) +{ + if (prog->Base.NumInstructions == 2) { + const struct prog_instruction *inst = prog->Base.Instructions; + if (inst[0].Opcode == OPCODE_MOV && + inst[1].Opcode == OPCODE_END && + inst[0].DstReg.File == PROGRAM_OUTPUT && + inst[0].DstReg.Index == FRAG_RESULT_COLOR && + inst[0].DstReg.WriteMask == WRITEMASK_XYZW && + inst[0].SrcReg[0].File == PROGRAM_INPUT && + inst[0].SrcReg[0].Index == FRAG_ATTRIB_COL0 && + inst[0].SrcReg[0].Swizzle == SWIZZLE_XYZW) { + return GL_TRUE; + } + } + return GL_FALSE; +} + + + +/** + * Make fragment shader for glDraw/CopyPixels. This shader is made + * by combining the pixel transfer shader with the user-defined shader. 
+ * \param fpIn the current/incoming fragment program + * \param fpOut returns the combined fragment program + */ +void +st_make_drawpix_fragment_program(struct st_context *st, + struct gl_fragment_program *fpIn, + struct gl_fragment_program **fpOut) +{ + struct gl_program *newProg; + + if (is_passthrough_program(fpIn)) { + newProg = (struct gl_program *) _mesa_clone_fragment_program(st->ctx, + &st->pixel_xfer.program->Base); + } + else { +#if 0 + /* debug */ + printf("Base program:\n"); + _mesa_print_program(&fpIn->Base); + printf("DrawPix program:\n"); + _mesa_print_program(&st->pixel_xfer.program->Base.Base); +#endif + newProg = _mesa_combine_programs(st->ctx, + &st->pixel_xfer.program->Base.Base, + &fpIn->Base); + } + +#if 0 + /* debug */ + printf("Combined DrawPixels program:\n"); + _mesa_print_program(newProg); + printf("InputsRead: 0x%x\n", newProg->InputsRead); + printf("OutputsWritten: 0x%x\n", newProg->OutputsWritten); + _mesa_print_parameter_list(newProg->Parameters); +#endif + + *fpOut = (struct gl_fragment_program *) newProg; +} + + +/** + * Create fragment program that does a TEX() instruction to get a Z and/or + * stencil value value, then writes to FRAG_RESULT_DEPTH/FRAG_RESULT_STENCIL. + * Used for glDrawPixels(GL_DEPTH_COMPONENT / GL_STENCIL_INDEX). + * Pass fragment color through as-is. + * \return pointer to the gl_fragment program + */ +struct gl_fragment_program * +st_make_drawpix_z_stencil_program(struct st_context *st, + GLboolean write_depth, + GLboolean write_stencil) +{ + struct gl_context *ctx = st->ctx; + struct gl_program *p; + struct gl_fragment_program *fp; + GLuint ic = 0; + const GLuint shaderIndex = write_depth * 2 + write_stencil; + + assert(shaderIndex < Elements(st->drawpix.shaders)); + + if (st->drawpix.shaders[shaderIndex]) { + /* already have the proper shader */ + return st->drawpix.shaders[shaderIndex]; + } + + /* + * Create shader now + */ + p = ctx->Driver.NewProgram(ctx, GL_FRAGMENT_PROGRAM_ARB, 0); + if (!p) + return NULL; + + p->NumInstructions = write_depth ? 2 : 1; + p->NumInstructions += write_stencil ? 
1 : 0; + + p->Instructions = _mesa_alloc_instructions(p->NumInstructions); + if (!p->Instructions) { + ctx->Driver.DeleteProgram(ctx, p); + return NULL; + } + _mesa_init_instructions(p->Instructions, p->NumInstructions); + + if (write_depth) { + /* TEX result.depth, fragment.texcoord[0], texture[0], 2D; */ + p->Instructions[ic].Opcode = OPCODE_TEX; + p->Instructions[ic].DstReg.File = PROGRAM_OUTPUT; + p->Instructions[ic].DstReg.Index = FRAG_RESULT_DEPTH; + p->Instructions[ic].DstReg.WriteMask = WRITEMASK_Z; + p->Instructions[ic].SrcReg[0].File = PROGRAM_INPUT; + p->Instructions[ic].SrcReg[0].Index = FRAG_ATTRIB_TEX0; + p->Instructions[ic].TexSrcUnit = 0; + p->Instructions[ic].TexSrcTarget = TEXTURE_2D_INDEX; + ic++; + } + + if (write_stencil) { + /* TEX result.stencil, fragment.texcoord[0], texture[0], 2D; */ + p->Instructions[ic].Opcode = OPCODE_TEX; + p->Instructions[ic].DstReg.File = PROGRAM_OUTPUT; + p->Instructions[ic].DstReg.Index = FRAG_RESULT_STENCIL; + p->Instructions[ic].DstReg.WriteMask = WRITEMASK_Y; + p->Instructions[ic].SrcReg[0].File = PROGRAM_INPUT; + p->Instructions[ic].SrcReg[0].Index = FRAG_ATTRIB_TEX0; + p->Instructions[ic].TexSrcUnit = 1; + p->Instructions[ic].TexSrcTarget = TEXTURE_2D_INDEX; + ic++; + } + + /* END; */ + p->Instructions[ic++].Opcode = OPCODE_END; + + assert(ic == p->NumInstructions); + + p->InputsRead = FRAG_BIT_TEX0 | FRAG_BIT_COL0; + p->OutputsWritten = 0; + if (write_depth) + p->OutputsWritten |= (1 << FRAG_RESULT_DEPTH); + if (write_stencil) + p->OutputsWritten |= (1 << FRAG_RESULT_STENCIL); + + p->SamplersUsed = 0x1; /* sampler 0 (bit 0) is used */ + if (write_stencil) + p->SamplersUsed |= 1 << 1; + + fp = (struct gl_fragment_program *) p; + + /* save the new shader */ + st->drawpix.shaders[shaderIndex] = fp; + + return fp; +} + + +/** + * Create a simple vertex shader that just passes through the + * vertex position and texcoord (and optionally, color). + */ +static void * +make_passthrough_vertex_shader(struct st_context *st, + GLboolean passColor) +{ + if (!st->drawpix.vert_shaders[passColor]) { + struct ureg_program *ureg = ureg_create( TGSI_PROCESSOR_VERTEX ); + + if (ureg == NULL) + return NULL; + + /* MOV result.pos, vertex.pos; */ + ureg_MOV(ureg, + ureg_DECL_output( ureg, TGSI_SEMANTIC_POSITION, 0 ), + ureg_DECL_vs_input( ureg, 0 )); + + /* MOV result.texcoord0, vertex.attr[1]; */ + ureg_MOV(ureg, + ureg_DECL_output( ureg, TGSI_SEMANTIC_GENERIC, 0 ), + ureg_DECL_vs_input( ureg, 1 )); + + if (passColor) { + /* MOV result.color0, vertex.attr[2]; */ + ureg_MOV(ureg, + ureg_DECL_output( ureg, TGSI_SEMANTIC_COLOR, 0 ), + ureg_DECL_vs_input( ureg, 2 )); + } + + ureg_END( ureg ); + + st->drawpix.vert_shaders[passColor] = + ureg_create_shader_and_destroy( ureg, st->pipe ); + } + + return st->drawpix.vert_shaders[passColor]; +} + + +/** + * Return a texture base format for drawing/copying an image + * of the given format. + */ +static GLenum +base_format(GLenum format) +{ + switch (format) { + case GL_DEPTH_COMPONENT: + return GL_DEPTH_COMPONENT; + case GL_DEPTH_STENCIL: + return GL_DEPTH_STENCIL; + case GL_STENCIL_INDEX: + return GL_STENCIL_INDEX; + default: + return GL_RGBA; + } +} + + +/** + * Return a texture internalFormat for drawing/copying an image + * of the given format and type. 
+ */ +static GLenum +internal_format(GLenum format, GLenum type) +{ + switch (format) { + case GL_DEPTH_COMPONENT: + return GL_DEPTH_COMPONENT; + case GL_DEPTH_STENCIL: + return GL_DEPTH_STENCIL; + case GL_STENCIL_INDEX: + return GL_STENCIL_INDEX; + default: + if (_mesa_is_integer_format(format)) { + switch (type) { + case GL_BYTE: + return GL_RGBA8I; + case GL_UNSIGNED_BYTE: + return GL_RGBA8UI; + case GL_SHORT: + return GL_RGBA16I; + case GL_UNSIGNED_SHORT: + return GL_RGBA16UI; + case GL_INT: + return GL_RGBA32I; + case GL_UNSIGNED_INT: + return GL_RGBA32UI; + default: + assert(0 && "Unexpected type in internal_format()"); + return GL_RGBA_INTEGER; + } + } + else { + return GL_RGBA; + } + } +} + + +/** + * Create a temporary texture to hold an image of the given size. + * If width, height are not POT and the driver only handles POT textures, + * allocate the next larger size of texture that is POT. + */ +static struct pipe_resource * +alloc_texture(struct st_context *st, GLsizei width, GLsizei height, + enum pipe_format texFormat) +{ + struct pipe_resource *pt; + + pt = st_texture_create(st, st->internal_target, texFormat, 0, + width, height, 1, 1, PIPE_BIND_SAMPLER_VIEW); + + return pt; +} + + +/** + * Make texture containing an image for glDrawPixels image. + * If 'pixels' is NULL, leave the texture image data undefined. + */ +static struct pipe_resource * +make_texture(struct st_context *st, + GLsizei width, GLsizei height, GLenum format, GLenum type, + const struct gl_pixelstore_attrib *unpack, + const GLvoid *pixels) +{ + struct gl_context *ctx = st->ctx; + struct pipe_context *pipe = st->pipe; + gl_format mformat; + struct pipe_resource *pt; + enum pipe_format pipeFormat; + GLuint cpp; + GLenum baseFormat, intFormat; + + baseFormat = base_format(format); + intFormat = internal_format(format, type); + + mformat = st_ChooseTextureFormat_renderable(ctx, intFormat, + format, type, GL_FALSE); + assert(mformat); + + pipeFormat = st_mesa_format_to_pipe_format(mformat); + assert(pipeFormat); + cpp = util_format_get_blocksize(pipeFormat); + + pixels = _mesa_map_pbo_source(ctx, unpack, pixels); + if (!pixels) + return NULL; + + /* alloc temporary texture */ + pt = alloc_texture(st, width, height, pipeFormat); + if (!pt) { + _mesa_unmap_pbo_source(ctx, unpack); + return NULL; + } + + { + struct pipe_transfer *transfer; + static const GLuint dstImageOffsets = 0; + GLboolean success; + GLubyte *dest; + const GLbitfield imageTransferStateSave = ctx->_ImageTransferState; + + /* we'll do pixel transfer in a fragment shader */ + ctx->_ImageTransferState = 0x0; + + transfer = pipe_get_transfer(st->pipe, pt, 0, 0, + PIPE_TRANSFER_WRITE, 0, 0, + width, height); + + /* map texture transfer */ + dest = pipe_transfer_map(pipe, transfer); + + + /* Put image into texture transfer. + * Note that the image is actually going to be upside down in + * the texture. We deal with that with texcoords. 
+ */ + success = _mesa_texstore(ctx, 2, /* dims */ + baseFormat, /* baseInternalFormat */ + mformat, /* gl_format */ + dest, /* dest */ + 0, 0, 0, /* dstX/Y/Zoffset */ + transfer->stride, /* dstRowStride, bytes */ + &dstImageOffsets, /* dstImageOffsets */ + width, height, 1, /* size */ + format, type, /* src format/type */ + pixels, /* data source */ + unpack); + + /* unmap */ + pipe_transfer_unmap(pipe, transfer); + pipe->transfer_destroy(pipe, transfer); + + assert(success); + + /* restore */ + ctx->_ImageTransferState = imageTransferStateSave; + } + + _mesa_unmap_pbo_source(ctx, unpack); + + return pt; +} + + +/** + * Draw quad with texcoords and optional color. + * Coords are gallium window coords with y=0=top. + * \param color may be null + * \param invertTex if true, flip texcoords vertically + */ +static void +draw_quad(struct gl_context *ctx, GLfloat x0, GLfloat y0, GLfloat z, + GLfloat x1, GLfloat y1, const GLfloat *color, + GLboolean invertTex, GLfloat maxXcoord, GLfloat maxYcoord) +{ + struct st_context *st = st_context(ctx); + struct pipe_context *pipe = st->pipe; + GLfloat verts[4][3][4]; /* four verts, three attribs, XYZW */ + + /* setup vertex data */ + { + const struct gl_framebuffer *fb = st->ctx->DrawBuffer; + const GLfloat fb_width = (GLfloat) fb->Width; + const GLfloat fb_height = (GLfloat) fb->Height; + const GLfloat clip_x0 = x0 / fb_width * 2.0f - 1.0f; + const GLfloat clip_y0 = y0 / fb_height * 2.0f - 1.0f; + const GLfloat clip_x1 = x1 / fb_width * 2.0f - 1.0f; + const GLfloat clip_y1 = y1 / fb_height * 2.0f - 1.0f; + const GLfloat sLeft = 0.0f, sRight = maxXcoord; + const GLfloat tTop = invertTex ? maxYcoord : 0.0f; + const GLfloat tBot = invertTex ? 0.0f : maxYcoord; + GLuint i; + + /* upper-left */ + verts[0][0][0] = clip_x0; /* v[0].attr[0].x */ + verts[0][0][1] = clip_y0; /* v[0].attr[0].y */ + + /* upper-right */ + verts[1][0][0] = clip_x1; + verts[1][0][1] = clip_y0; + + /* lower-right */ + verts[2][0][0] = clip_x1; + verts[2][0][1] = clip_y1; + + /* lower-left */ + verts[3][0][0] = clip_x0; + verts[3][0][1] = clip_y1; + + verts[0][1][0] = sLeft; /* v[0].attr[1].S */ + verts[0][1][1] = tTop; /* v[0].attr[1].T */ + verts[1][1][0] = sRight; + verts[1][1][1] = tTop; + verts[2][1][0] = sRight; + verts[2][1][1] = tBot; + verts[3][1][0] = sLeft; + verts[3][1][1] = tBot; + + /* same for all verts: */ + if (color) { + for (i = 0; i < 4; i++) { + verts[i][0][2] = z; /* v[i].attr[0].z */ + verts[i][0][3] = 1.0f; /* v[i].attr[0].w */ + verts[i][2][0] = color[0]; /* v[i].attr[2].r */ + verts[i][2][1] = color[1]; /* v[i].attr[2].g */ + verts[i][2][2] = color[2]; /* v[i].attr[2].b */ + verts[i][2][3] = color[3]; /* v[i].attr[2].a */ + verts[i][1][2] = 0.0f; /* v[i].attr[1].R */ + verts[i][1][3] = 1.0f; /* v[i].attr[1].Q */ + } + } + else { + for (i = 0; i < 4; i++) { + verts[i][0][2] = z; /*Z*/ + verts[i][0][3] = 1.0f; /*W*/ + verts[i][1][2] = 0.0f; /*R*/ + verts[i][1][3] = 1.0f; /*Q*/ + } + } + } + + { + struct pipe_resource *buf; + + /* allocate/load buffer object with vertex data */ + buf = pipe_buffer_create(pipe->screen, + PIPE_BIND_VERTEX_BUFFER, + PIPE_USAGE_STATIC, + sizeof(verts)); + pipe_buffer_write(st->pipe, buf, 0, sizeof(verts), verts); + + util_draw_vertex_buffer(pipe, st->cso_context, buf, 0, + PIPE_PRIM_QUADS, + 4, /* verts */ + 3); /* attribs/vert */ + pipe_resource_reference(&buf, NULL); + } +} + + + +static void +draw_textured_quad(struct gl_context *ctx, GLint x, GLint y, GLfloat z, + GLsizei width, GLsizei height, + GLfloat zoomX, GLfloat zoomY, + 
struct pipe_sampler_view **sv, + int num_sampler_view, + void *driver_vp, + void *driver_fp, + const GLfloat *color, + GLboolean invertTex, + GLboolean write_depth, GLboolean write_stencil) +{ + struct st_context *st = st_context(ctx); + struct pipe_context *pipe = st->pipe; + struct cso_context *cso = st->cso_context; + GLfloat x0, y0, x1, y1; + GLsizei maxSize; + boolean normalized = sv[0]->texture->target != PIPE_TEXTURE_RECT; + + /* limit checks */ + /* XXX if DrawPixels image is larger than max texture size, break + * it up into chunks. + */ + maxSize = 1 << (pipe->screen->get_param(pipe->screen, + PIPE_CAP_MAX_TEXTURE_2D_LEVELS) - 1); + assert(width <= maxSize); + assert(height <= maxSize); + + cso_save_rasterizer(cso); + cso_save_viewport(cso); + cso_save_samplers(cso); + cso_save_fragment_sampler_views(cso); + cso_save_fragment_shader(cso); + cso_save_vertex_shader(cso); + cso_save_vertex_elements(cso); + cso_save_vertex_buffers(cso); + if (write_stencil) { + cso_save_depth_stencil_alpha(cso); + cso_save_blend(cso); + } + + /* rasterizer state: just scissor */ + { + struct pipe_rasterizer_state rasterizer; + memset(&rasterizer, 0, sizeof(rasterizer)); + rasterizer.gl_rasterization_rules = 1; + rasterizer.scissor = ctx->Scissor.Enabled; + cso_set_rasterizer(cso, &rasterizer); + } + + if (write_stencil) { + /* Stencil writing bypasses the normal fragment pipeline to + * disable color writing and set stencil test to always pass. + */ + struct pipe_depth_stencil_alpha_state dsa; + struct pipe_blend_state blend; + + /* depth/stencil */ + memset(&dsa, 0, sizeof(dsa)); + dsa.stencil[0].enabled = 1; + dsa.stencil[0].func = PIPE_FUNC_ALWAYS; + dsa.stencil[0].writemask = ctx->Stencil.WriteMask[0] & 0xff; + dsa.stencil[0].zpass_op = PIPE_STENCIL_OP_REPLACE; + if (write_depth) { + /* writing depth+stencil: depth test always passes */ + dsa.depth.enabled = 1; + dsa.depth.writemask = ctx->Depth.Mask; + dsa.depth.func = PIPE_FUNC_ALWAYS; + } + cso_set_depth_stencil_alpha(cso, &dsa); + + /* blend (colormask) */ + memset(&blend, 0, sizeof(blend)); + cso_set_blend(cso, &blend); + } + + /* fragment shader state: TEX lookup program */ + cso_set_fragment_shader_handle(cso, driver_fp); + + /* vertex shader state: position + texcoord pass-through */ + cso_set_vertex_shader_handle(cso, driver_vp); + + + /* texture sampling state: */ + { + struct pipe_sampler_state sampler; + memset(&sampler, 0, sizeof(sampler)); + sampler.wrap_s = PIPE_TEX_WRAP_CLAMP; + sampler.wrap_t = PIPE_TEX_WRAP_CLAMP; + sampler.wrap_r = PIPE_TEX_WRAP_CLAMP; + sampler.min_img_filter = PIPE_TEX_FILTER_NEAREST; + sampler.min_mip_filter = PIPE_TEX_MIPFILTER_NONE; + sampler.mag_img_filter = PIPE_TEX_FILTER_NEAREST; + sampler.normalized_coords = normalized; + + cso_single_sampler(cso, 0, &sampler); + if (num_sampler_view > 1) { + cso_single_sampler(cso, 1, &sampler); + } + cso_single_sampler_done(cso); + } + + /* viewport state: viewport matching window dims */ + { + const float w = (float) ctx->DrawBuffer->Width; + const float h = (float) ctx->DrawBuffer->Height; + struct pipe_viewport_state vp; + vp.scale[0] = 0.5f * w; + vp.scale[1] = -0.5f * h; + vp.scale[2] = 0.5f; + vp.scale[3] = 1.0f; + vp.translate[0] = 0.5f * w; + vp.translate[1] = 0.5f * h; + vp.translate[2] = 0.5f; + vp.translate[3] = 0.0f; + cso_set_viewport(cso, &vp); + } + + cso_set_vertex_elements(cso, 3, st->velems_util_draw); + + /* texture state: */ + cso_set_fragment_sampler_views(cso, num_sampler_view, sv); + + /* Compute Gallium window coords (y=0=top) with pixel 
zoom. + * Recall that these coords are transformed by the current + * vertex shader and viewport transformation. + */ + if (st_fb_orientation(ctx->DrawBuffer) == Y_0_BOTTOM) { + y = ctx->DrawBuffer->Height - (int) (y + height * ctx->Pixel.ZoomY); + invertTex = !invertTex; + } + + x0 = (GLfloat) x; + x1 = x + width * ctx->Pixel.ZoomX; + y0 = (GLfloat) y; + y1 = y + height * ctx->Pixel.ZoomY; + + /* convert Z from [0,1] to [-1,-1] to match viewport Z scale/bias */ + z = z * 2.0 - 1.0; + + draw_quad(ctx, x0, y0, z, x1, y1, color, invertTex, + normalized ? ((GLfloat) width / sv[0]->texture->width0) : (GLfloat)width, + normalized ? ((GLfloat) height / sv[0]->texture->height0) : (GLfloat)height); + + /* restore state */ + cso_restore_rasterizer(cso); + cso_restore_viewport(cso); + cso_restore_samplers(cso); + cso_restore_fragment_sampler_views(cso); + cso_restore_fragment_shader(cso); + cso_restore_vertex_shader(cso); + cso_restore_vertex_elements(cso); + cso_restore_vertex_buffers(cso); + if (write_stencil) { + cso_restore_depth_stencil_alpha(cso); + cso_restore_blend(cso); + } +} + + +/** + * Software fallback to do glDrawPixels(GL_STENCIL_INDEX) when we + * can't use a fragment shader to write stencil values. + */ +static void +draw_stencil_pixels(struct gl_context *ctx, GLint x, GLint y, + GLsizei width, GLsizei height, GLenum format, GLenum type, + const struct gl_pixelstore_attrib *unpack, + const GLvoid *pixels) +{ + struct st_context *st = st_context(ctx); + struct pipe_context *pipe = st->pipe; + struct st_renderbuffer *strb; + enum pipe_transfer_usage usage; + struct pipe_transfer *pt; + const GLboolean zoom = ctx->Pixel.ZoomX != 1.0 || ctx->Pixel.ZoomY != 1.0; + GLint skipPixels; + ubyte *stmap; + struct gl_pixelstore_attrib clippedUnpack = *unpack; + + if (!zoom) { + if (!_mesa_clip_drawpixels(ctx, &x, &y, &width, &height, + &clippedUnpack)) { + /* totally clipped */ + return; + } + } + + strb = st_renderbuffer(ctx->DrawBuffer-> + Attachment[BUFFER_STENCIL].Renderbuffer); + + if (st_fb_orientation(ctx->DrawBuffer) == Y_0_TOP) { + y = ctx->DrawBuffer->Height - y - height; + } + + if(format != GL_DEPTH_STENCIL && + util_format_get_component_bits(strb->format, + UTIL_FORMAT_COLORSPACE_ZS, 0) != 0) + usage = PIPE_TRANSFER_READ_WRITE; + else + usage = PIPE_TRANSFER_WRITE; + + pt = pipe_get_transfer(st_context(ctx)->pipe, strb->texture, 0, 0, + usage, x, y, + width, height); + + stmap = pipe_transfer_map(pipe, pt); + + pixels = _mesa_map_pbo_source(ctx, &clippedUnpack, pixels); + assert(pixels); + + /* if width > MAX_WIDTH, have to process image in chunks */ + skipPixels = 0; + while (skipPixels < width) { + const GLint spanX = skipPixels; + const GLint spanWidth = MIN2(width - skipPixels, MAX_WIDTH); + GLint row; + for (row = 0; row < height; row++) { + GLubyte sValues[MAX_WIDTH]; + GLuint zValues[MAX_WIDTH]; + GLenum destType = GL_UNSIGNED_BYTE; + const GLvoid *source = _mesa_image_address2d(&clippedUnpack, pixels, + width, height, + format, type, + row, skipPixels); + _mesa_unpack_stencil_span(ctx, spanWidth, destType, sValues, + type, source, &clippedUnpack, + ctx->_ImageTransferState); + + if (format == GL_DEPTH_STENCIL) { + _mesa_unpack_depth_span(ctx, spanWidth, GL_UNSIGNED_INT, zValues, + (1 << 24) - 1, type, source, + &clippedUnpack); + } + + if (zoom) { + _mesa_problem(ctx, "Gallium glDrawPixels(GL_STENCIL) with " + "zoom not complete"); + } + + { + GLint spanY; + + if (st_fb_orientation(ctx->DrawBuffer) == Y_0_TOP) { + spanY = height - row - 1; + } + else { + spanY = row; + } + 
+ /* now pack the stencil (and Z) values in the dest format */ + switch (pt->resource->format) { + case PIPE_FORMAT_S8_USCALED: + { + ubyte *dest = stmap + spanY * pt->stride + spanX; + assert(usage == PIPE_TRANSFER_WRITE); + memcpy(dest, sValues, spanWidth); + } + break; + case PIPE_FORMAT_Z24_UNORM_S8_USCALED: + if (format == GL_DEPTH_STENCIL) { + uint *dest = (uint *) (stmap + spanY * pt->stride + spanX*4); + GLint k; + assert(usage == PIPE_TRANSFER_WRITE); + for (k = 0; k < spanWidth; k++) { + dest[k] = zValues[k] | (sValues[k] << 24); + } + } + else { + uint *dest = (uint *) (stmap + spanY * pt->stride + spanX*4); + GLint k; + assert(usage == PIPE_TRANSFER_READ_WRITE); + for (k = 0; k < spanWidth; k++) { + dest[k] = (dest[k] & 0xffffff) | (sValues[k] << 24); + } + } + break; + case PIPE_FORMAT_S8_USCALED_Z24_UNORM: + if (format == GL_DEPTH_STENCIL) { + uint *dest = (uint *) (stmap + spanY * pt->stride + spanX*4); + GLint k; + assert(usage == PIPE_TRANSFER_WRITE); + for (k = 0; k < spanWidth; k++) { + dest[k] = (zValues[k] << 8) | (sValues[k] & 0xff); + } + } + else { + uint *dest = (uint *) (stmap + spanY * pt->stride + spanX*4); + GLint k; + assert(usage == PIPE_TRANSFER_READ_WRITE); + for (k = 0; k < spanWidth; k++) { + dest[k] = (dest[k] & 0xffffff00) | (sValues[k] & 0xff); + } + } + break; + default: + assert(0); + } + } + } + skipPixels += spanWidth; + } + + _mesa_unmap_pbo_source(ctx, &clippedUnpack); + + /* unmap the stencil buffer */ + pipe_transfer_unmap(pipe, pt); + pipe->transfer_destroy(pipe, pt); +} + + +/** + * Get fragment program variant for a glDrawPixels or glCopyPixels + * command for RGBA data. + */ +static struct st_fp_variant * +get_color_fp_variant(struct st_context *st) +{ + struct gl_context *ctx = st->ctx; + struct st_fp_variant_key key; + struct st_fp_variant *fpv; + + memset(&key, 0, sizeof(key)); + + key.st = st; + key.drawpixels = 1; + key.scaleAndBias = (ctx->Pixel.RedBias != 0.0 || + ctx->Pixel.RedScale != 1.0 || + ctx->Pixel.GreenBias != 0.0 || + ctx->Pixel.GreenScale != 1.0 || + ctx->Pixel.BlueBias != 0.0 || + ctx->Pixel.BlueScale != 1.0 || + ctx->Pixel.AlphaBias != 0.0 || + ctx->Pixel.AlphaScale != 1.0); + key.pixelMaps = ctx->Pixel.MapColorFlag; + + fpv = st_get_fp_variant(st, st->fp, &key); + + return fpv; +} + + +/** + * Get fragment program variant for a glDrawPixels or glCopyPixels + * command for depth/stencil data. 
+ */ +static struct st_fp_variant * +get_depth_stencil_fp_variant(struct st_context *st, GLboolean write_depth, + GLboolean write_stencil) +{ + struct st_fp_variant_key key; + struct st_fp_variant *fpv; + + memset(&key, 0, sizeof(key)); + + key.st = st; + key.drawpixels = 1; + key.drawpixels_z = write_depth; + key.drawpixels_stencil = write_stencil; + + fpv = st_get_fp_variant(st, st->fp, &key); + + return fpv; +} + + +/** + * Called via ctx->Driver.DrawPixels() + */ +static void +st_DrawPixels(struct gl_context *ctx, GLint x, GLint y, + GLsizei width, GLsizei height, + GLenum format, GLenum type, + const struct gl_pixelstore_attrib *unpack, const GLvoid *pixels) +{ + void *driver_vp, *driver_fp; + struct st_context *st = st_context(ctx); + const GLfloat *color; + struct pipe_context *pipe = st->pipe; + GLboolean write_stencil = GL_FALSE, write_depth = GL_FALSE; + struct pipe_sampler_view *sv[2]; + int num_sampler_view = 1; + enum pipe_format stencil_format = PIPE_FORMAT_NONE; + struct st_fp_variant *fpv; + + if (format == GL_DEPTH_STENCIL) + write_stencil = write_depth = GL_TRUE; + else if (format == GL_STENCIL_INDEX) + write_stencil = GL_TRUE; + else if (format == GL_DEPTH_COMPONENT) + write_depth = GL_TRUE; + + if (write_stencil) { + enum pipe_format tex_format; + /* can we write to stencil if not fallback */ + if (!pipe->screen->get_param(pipe->screen, PIPE_CAP_SHADER_STENCIL_EXPORT)) + goto stencil_fallback; + + tex_format = st_choose_format(st->pipe->screen, base_format(format), + PIPE_TEXTURE_2D, + 0, PIPE_BIND_SAMPLER_VIEW); + if (tex_format == PIPE_FORMAT_Z24_UNORM_S8_USCALED) + stencil_format = PIPE_FORMAT_X24S8_USCALED; + else if (tex_format == PIPE_FORMAT_S8_USCALED_Z24_UNORM) + stencil_format = PIPE_FORMAT_S8X24_USCALED; + else + stencil_format = PIPE_FORMAT_S8_USCALED; + if (stencil_format == PIPE_FORMAT_NONE) + goto stencil_fallback; + } + + /* Mesa state should be up to date by now */ + assert(ctx->NewState == 0x0); + + st_validate_state(st); + + /* + * Get vertex/fragment shaders + */ + if (write_depth || write_stencil) { + fpv = get_depth_stencil_fp_variant(st, write_depth, write_stencil); + + driver_fp = fpv->driver_shader; + + driver_vp = make_passthrough_vertex_shader(st, GL_TRUE); + + color = ctx->Current.RasterColor; + } + else { + fpv = get_color_fp_variant(st); + + driver_fp = fpv->driver_shader; + + driver_vp = make_passthrough_vertex_shader(st, GL_FALSE); + + color = NULL; + if (st->pixel_xfer.pixelmap_enabled) { + sv[1] = st->pixel_xfer.pixelmap_sampler_view; + num_sampler_view++; + } + } + + /* update fragment program constants */ + st_upload_constants(st, fpv->parameters, PIPE_SHADER_FRAGMENT); + + /* draw with textured quad */ + { + struct pipe_resource *pt + = make_texture(st, width, height, format, type, unpack, pixels); + if (pt) { + sv[0] = st_create_texture_sampler_view(st->pipe, pt); + + if (sv[0]) { + if (write_stencil) { + sv[1] = st_create_texture_sampler_view_format(st->pipe, pt, + stencil_format); + num_sampler_view++; + } + + draw_textured_quad(ctx, x, y, ctx->Current.RasterPos[2], + width, height, + ctx->Pixel.ZoomX, ctx->Pixel.ZoomY, + sv, + num_sampler_view, + driver_vp, + driver_fp, + color, GL_FALSE, write_depth, write_stencil); + pipe_sampler_view_reference(&sv[0], NULL); + if (num_sampler_view > 1) + pipe_sampler_view_reference(&sv[1], NULL); + } + pipe_resource_reference(&pt, NULL); + } + } + return; + +stencil_fallback: + draw_stencil_pixels(ctx, x, y, width, height, format, type, + unpack, pixels); +} + + + +/** + * Software fallback for 
glCopyPixels(GL_STENCIL). + */ +static void +copy_stencil_pixels(struct gl_context *ctx, GLint srcx, GLint srcy, + GLsizei width, GLsizei height, + GLint dstx, GLint dsty) +{ + struct st_renderbuffer *rbDraw; + struct pipe_context *pipe = st_context(ctx)->pipe; + enum pipe_transfer_usage usage; + struct pipe_transfer *ptDraw; + ubyte *drawMap; + ubyte *buffer; + int i; + + buffer = malloc(width * height * sizeof(ubyte)); + if (!buffer) { + _mesa_error(ctx, GL_OUT_OF_MEMORY, "glCopyPixels(stencil)"); + return; + } + + /* Get the dest renderbuffer. If there's a wrapper, use the + * underlying renderbuffer. + */ + rbDraw = st_renderbuffer(ctx->DrawBuffer->_StencilBuffer); + if (rbDraw->Base.Wrapped) + rbDraw = st_renderbuffer(rbDraw->Base.Wrapped); + + /* this will do stencil pixel transfer ops */ + st_read_stencil_pixels(ctx, srcx, srcy, width, height, + GL_STENCIL_INDEX, GL_UNSIGNED_BYTE, + &ctx->DefaultPacking, buffer); + + if (0) { + /* debug code: dump stencil values */ + GLint row, col; + for (row = 0; row < height; row++) { + printf("%3d: ", row); + for (col = 0; col < width; col++) { + printf("%02x ", buffer[col + row * width]); + } + printf("\n"); + } + } + + if (util_format_get_component_bits(rbDraw->format, + UTIL_FORMAT_COLORSPACE_ZS, 0) != 0) + usage = PIPE_TRANSFER_READ_WRITE; + else + usage = PIPE_TRANSFER_WRITE; + + if (st_fb_orientation(ctx->DrawBuffer) == Y_0_TOP) { + dsty = rbDraw->Base.Height - dsty - height; + } + + ptDraw = pipe_get_transfer(st_context(ctx)->pipe, + rbDraw->texture, 0, 0, + usage, dstx, dsty, + width, height); + + assert(util_format_get_blockwidth(ptDraw->resource->format) == 1); + assert(util_format_get_blockheight(ptDraw->resource->format) == 1); + + /* map the stencil buffer */ + drawMap = pipe_transfer_map(pipe, ptDraw); + + /* draw */ + /* XXX PixelZoom not handled yet */ + for (i = 0; i < height; i++) { + ubyte *dst; + const ubyte *src; + int y; + + y = i; + + if (st_fb_orientation(ctx->DrawBuffer) == Y_0_TOP) { + y = height - y - 1; + } + + dst = drawMap + y * ptDraw->stride; + src = buffer + i * width; + + switch (ptDraw->resource->format) { + case PIPE_FORMAT_Z24_UNORM_S8_USCALED: + { + uint *dst4 = (uint *) dst; + int j; + assert(usage == PIPE_TRANSFER_READ_WRITE); + for (j = 0; j < width; j++) { + *dst4 = (*dst4 & 0xffffff) | (src[j] << 24); + dst4++; + } + } + break; + case PIPE_FORMAT_S8_USCALED_Z24_UNORM: + { + uint *dst4 = (uint *) dst; + int j; + assert(usage == PIPE_TRANSFER_READ_WRITE); + for (j = 0; j < width; j++) { + *dst4 = (*dst4 & 0xffffff00) | (src[j] & 0xff); + dst4++; + } + } + break; + case PIPE_FORMAT_S8_USCALED: + assert(usage == PIPE_TRANSFER_WRITE); + memcpy(dst, src, width); + break; + default: + assert(0); + } + } + + free(buffer); + + /* unmap the stencil buffer */ + pipe_transfer_unmap(pipe, ptDraw); + pipe->transfer_destroy(pipe, ptDraw); +} + + +static void +st_CopyPixels(struct gl_context *ctx, GLint srcx, GLint srcy, + GLsizei width, GLsizei height, + GLint dstx, GLint dsty, GLenum type) +{ + struct st_context *st = st_context(ctx); + struct pipe_context *pipe = st->pipe; + struct pipe_screen *screen = pipe->screen; + struct st_renderbuffer *rbRead; + void *driver_vp, *driver_fp; + struct pipe_resource *pt; + struct pipe_sampler_view *sv[2]; + int num_sampler_view = 1; + GLfloat *color; + enum pipe_format srcFormat, texFormat; + GLboolean invertTex = GL_FALSE; + GLint readX, readY, readW, readH; + GLuint sample_count; + struct gl_pixelstore_attrib pack = ctx->DefaultPacking; + struct st_fp_variant *fpv; + + 
st_validate_state(st); + + if (type == GL_STENCIL) { + /* can't use texturing to do stencil */ + copy_stencil_pixels(ctx, srcx, srcy, width, height, dstx, dsty); + return; + } + + /* + * Get vertex/fragment shaders + */ + if (type == GL_COLOR) { + rbRead = st_get_color_read_renderbuffer(ctx); + color = NULL; + + fpv = get_color_fp_variant(st); + driver_fp = fpv->driver_shader; + + driver_vp = make_passthrough_vertex_shader(st, GL_FALSE); + + if (st->pixel_xfer.pixelmap_enabled) { + sv[1] = st->pixel_xfer.pixelmap_sampler_view; + num_sampler_view++; + } + } + else { + assert(type == GL_DEPTH); + rbRead = st_renderbuffer(ctx->ReadBuffer->_DepthBuffer); + color = ctx->Current.Attrib[VERT_ATTRIB_COLOR0]; + + fpv = get_depth_stencil_fp_variant(st, GL_TRUE, GL_FALSE); + driver_fp = fpv->driver_shader; + + driver_vp = make_passthrough_vertex_shader(st, GL_TRUE); + } + + /* update fragment program constants */ + st_upload_constants(st, fpv->parameters, PIPE_SHADER_FRAGMENT); + + + if (rbRead->Base.Wrapped) + rbRead = st_renderbuffer(rbRead->Base.Wrapped); + + sample_count = rbRead->texture->nr_samples; + /* I believe this would be legal, presumably would need to do a resolve + for color, and for depth/stencil spec says to just use one of the + depth/stencil samples per pixel? Need some transfer clarifications. */ + assert(sample_count < 2); + + srcFormat = rbRead->texture->format; + + if (screen->is_format_supported(screen, srcFormat, st->internal_target, + sample_count, + PIPE_BIND_SAMPLER_VIEW, 0)) { + texFormat = srcFormat; + } + else { + /* srcFormat can't be used as a texture format */ + if (type == GL_DEPTH) { + texFormat = st_choose_format(screen, GL_DEPTH_COMPONENT, + st->internal_target, sample_count, + PIPE_BIND_DEPTH_STENCIL); + assert(texFormat != PIPE_FORMAT_NONE); + } + else { + /* default color format */ + texFormat = st_choose_format(screen, GL_RGBA, st->internal_target, + sample_count, PIPE_BIND_SAMPLER_VIEW); + assert(texFormat != PIPE_FORMAT_NONE); + } + } + + /* Invert src region if needed */ + if (st_fb_orientation(ctx->ReadBuffer) == Y_0_TOP) { + srcy = ctx->ReadBuffer->Height - srcy - height; + invertTex = !invertTex; + } + + /* Clip the read region against the src buffer bounds. + * We'll still allocate a temporary buffer/texture for the original + * src region size but we'll only read the region which is on-screen. + * This may mean that we draw garbage pixels into the dest region, but + * that's expected. + */ + readX = srcx; + readY = srcy; + readW = width; + readH = height; + _mesa_clip_readpixels(ctx, &readX, &readY, &readW, &readH, &pack); + readW = MAX2(0, readW); + readH = MAX2(0, readH); + + /* alloc temporary texture */ + pt = alloc_texture(st, width, height, texFormat); + if (!pt) + return; + + sv[0] = st_create_texture_sampler_view(st->pipe, pt); + if (!sv[0]) { + pipe_resource_reference(&pt, NULL); + return; + } + + /* Make temporary texture which is a copy of the src region. 
+ */ + if (srcFormat == texFormat) { + struct pipe_box src_box; + u_box_2d(readX, readY, readW, readH, &src_box); + /* copy source framebuffer surface into mipmap/texture */ + pipe->resource_copy_region(pipe, + pt, /* dest tex */ + 0, + pack.SkipPixels, pack.SkipRows, 0, /* dest pos */ + rbRead->texture, /* src tex */ + 0, + &src_box); + + } + else { + /* CPU-based fallback/conversion */ + struct pipe_transfer *ptRead = + pipe_get_transfer(st->pipe, rbRead->texture, 0, 0, + PIPE_TRANSFER_READ, + readX, readY, readW, readH); + struct pipe_transfer *ptTex; + enum pipe_transfer_usage transfer_usage; + + if (ST_DEBUG & DEBUG_FALLBACK) + debug_printf("%s: fallback processing\n", __FUNCTION__); + + if (type == GL_DEPTH && util_format_is_depth_and_stencil(pt->format)) + transfer_usage = PIPE_TRANSFER_READ_WRITE; + else + transfer_usage = PIPE_TRANSFER_WRITE; + + ptTex = pipe_get_transfer(st->pipe, pt, 0, 0, transfer_usage, + 0, 0, width, height); + + /* copy image from ptRead surface to ptTex surface */ + if (type == GL_COLOR) { + /* alternate path using get/put_tile() */ + GLfloat *buf = (GLfloat *) malloc(width * height * 4 * sizeof(GLfloat)); + enum pipe_format readFormat, drawFormat; + readFormat = util_format_linear(rbRead->texture->format); + drawFormat = util_format_linear(pt->format); + pipe_get_tile_rgba_format(pipe, ptRead, readX, readY, readW, readH, + readFormat, buf); + pipe_put_tile_rgba_format(pipe, ptTex, pack.SkipPixels, pack.SkipRows, + readW, readH, drawFormat, buf); + free(buf); + } + else { + /* GL_DEPTH */ + GLuint *buf = (GLuint *) malloc(width * height * sizeof(GLuint)); + pipe_get_tile_z(pipe, ptRead, readX, readY, readW, readH, buf); + pipe_put_tile_z(pipe, ptTex, pack.SkipPixels, pack.SkipRows, + readW, readH, buf); + free(buf); + } + + pipe->transfer_destroy(pipe, ptRead); + pipe->transfer_destroy(pipe, ptTex); + } + + /* OK, the texture 'pt' contains the src image/pixels. Now draw a + * textured quad with that texture. + */ + draw_textured_quad(ctx, dstx, dsty, ctx->Current.RasterPos[2], + width, height, ctx->Pixel.ZoomX, ctx->Pixel.ZoomY, + sv, + num_sampler_view, + driver_vp, + driver_fp, + color, invertTex, GL_FALSE, GL_FALSE); + + pipe_resource_reference(&pt, NULL); + pipe_sampler_view_reference(&sv[0], NULL); +} + + + +void st_init_drawpixels_functions(struct dd_function_table *functions) +{ + functions->DrawPixels = st_DrawPixels; + functions->CopyPixels = st_CopyPixels; +} + + +void +st_destroy_drawpix(struct st_context *st) +{ + GLuint i; + + for (i = 0; i < Elements(st->drawpix.shaders); i++) { + if (st->drawpix.shaders[i]) + _mesa_reference_fragprog(st->ctx, &st->drawpix.shaders[i], NULL); + } + + st_reference_fragprog(st, &st->pixel_xfer.combined_prog, NULL); + if (st->drawpix.vert_shaders[0]) + ureg_free_tokens(st->drawpix.vert_shaders[0]); + if (st->drawpix.vert_shaders[1]) + ureg_free_tokens(st->drawpix.vert_shaders[1]); +} + +#endif /* FEATURE_drawpix */ diff --git a/mesalib/src/mesa/state_tracker/st_cb_drawtex.c b/mesalib/src/mesa/state_tracker/st_cb_drawtex.c index 5976f1048..86ceb9d78 100644 --- a/mesalib/src/mesa/state_tracker/st_cb_drawtex.c +++ b/mesalib/src/mesa/state_tracker/st_cb_drawtex.c @@ -1,304 +1,307 @@ -/************************************************************************** - * - * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. - * All Rights Reserved. 
- * - **************************************************************************/ - - -/** - * Implementation of glDrawTex() for GL_OES_draw_tex - */ - - - -#include "main/imports.h" -#include "main/image.h" -#include "main/macros.h" -#include "main/mfeatures.h" -#include "program/program.h" -#include "program/prog_print.h" - -#include "st_context.h" -#include "st_atom.h" -#include "st_cb_drawtex.h" - -#include "pipe/p_context.h" -#include "pipe/p_defines.h" -#include "util/u_inlines.h" -#include "pipe/p_shader_tokens.h" -#include "util/u_draw_quad.h" -#include "util/u_simple_shaders.h" - -#include "cso_cache/cso_context.h" - - -#if FEATURE_OES_draw_texture - - -struct cached_shader -{ - void *handle; - - uint num_attribs; - uint semantic_names[2 + MAX_TEXTURE_UNITS]; - uint semantic_indexes[2 + MAX_TEXTURE_UNITS]; -}; - -#define MAX_SHADERS (2 * MAX_TEXTURE_UNITS) - -/** - * Simple linear list cache. - * Most of the time there'll only be one cached shader. - */ -static struct cached_shader CachedShaders[MAX_SHADERS]; -static GLuint NumCachedShaders = 0; - - -static void * -lookup_shader(struct pipe_context *pipe, - uint num_attribs, - const uint *semantic_names, - const uint *semantic_indexes) -{ - GLuint i, j; - - /* look for existing shader with same attributes */ - for (i = 0; i < NumCachedShaders; i++) { - if (CachedShaders[i].num_attribs == num_attribs) { - GLboolean match = GL_TRUE; - for (j = 0; j < num_attribs; j++) { - if (semantic_names[j] != CachedShaders[i].semantic_names[j] || - semantic_indexes[j] != CachedShaders[i].semantic_indexes[j]) { - match = GL_FALSE; - break; - } - } - if (match) - return CachedShaders[i].handle; - } - } - - /* not found - create new one now */ - if (NumCachedShaders >= MAX_SHADERS) { - return NULL; - } - - CachedShaders[i].num_attribs = num_attribs; - for (j = 0; j < num_attribs; j++) { - CachedShaders[i].semantic_names[j] = semantic_names[j]; - CachedShaders[i].semantic_indexes[j] = semantic_indexes[j]; - } - - CachedShaders[i].handle = - util_make_vertex_passthrough_shader(pipe, - num_attribs, - semantic_names, - semantic_indexes); - NumCachedShaders++; - - return CachedShaders[i].handle; -} - -static void -st_DrawTex(struct gl_context *ctx, GLfloat x, GLfloat y, GLfloat z, - GLfloat width, GLfloat height) -{ - struct st_context *st = ctx->st; - struct pipe_context *pipe = st->pipe; - struct cso_context *cso = ctx->st->cso_context; - struct pipe_resource *vbuffer; - struct pipe_transfer *vbuffer_transfer; - GLuint i, numTexCoords, numAttribs; - GLboolean emitColor; - uint semantic_names[2 + MAX_TEXTURE_UNITS]; - uint semantic_indexes[2 + MAX_TEXTURE_UNITS]; - struct pipe_vertex_element velements[2 + MAX_TEXTURE_UNITS]; - GLbitfield inputs = VERT_BIT_POS; - - st_validate_state(st); - - /* determine if we need vertex color */ - if (ctx->FragmentProgram._Current->Base.InputsRead & FRAG_BIT_COL0) - emitColor = GL_TRUE; - else - emitColor = GL_FALSE; - - /* determine how many enabled sets of texcoords */ - numTexCoords = 0; - for (i = 0; i < ctx->Const.MaxTextureUnits; i++) { - if (ctx->Texture.Unit[i]._ReallyEnabled & TEXTURE_2D_BIT) { - inputs |= VERT_BIT_TEX(i); - numTexCoords++; - } - } - - /* total number of attributes per vertex */ - numAttribs = 1 + emitColor + numTexCoords; - - - /* create the vertex buffer */ - vbuffer = pipe_buffer_create(pipe->screen, PIPE_BIND_VERTEX_BUFFER, - numAttribs * 4 * 4 * sizeof(GLfloat)); - - /* load vertex buffer */ - { -#define SET_ATTRIB(VERT, ATTR, X, Y, Z, W) \ - do { \ - GLuint k = (((VERT) * numAttribs + 
(ATTR)) * 4); \ - assert(k < 4 * 4 * numAttribs); \ - vbuf[k + 0] = X; \ - vbuf[k + 1] = Y; \ - vbuf[k + 2] = Z; \ - vbuf[k + 3] = W; \ - } while (0) - - const GLfloat x0 = x, y0 = y, x1 = x + width, y1 = y + height; - GLfloat *vbuf = (GLfloat *) pipe_buffer_map(pipe, vbuffer, - PIPE_TRANSFER_WRITE, - &vbuffer_transfer); - GLuint attr; - - z = CLAMP(z, 0.0f, 1.0f); - - /* positions (in clip coords) */ - { - const struct gl_framebuffer *fb = st->ctx->DrawBuffer; - const GLfloat fb_width = (GLfloat)fb->Width; - const GLfloat fb_height = (GLfloat)fb->Height; - - const GLfloat clip_x0 = (GLfloat)(x0 / fb_width * 2.0 - 1.0); - const GLfloat clip_y0 = (GLfloat)(y0 / fb_height * 2.0 - 1.0); - const GLfloat clip_x1 = (GLfloat)(x1 / fb_width * 2.0 - 1.0); - const GLfloat clip_y1 = (GLfloat)(y1 / fb_height * 2.0 - 1.0); - - SET_ATTRIB(0, 0, clip_x0, clip_y0, z, 1.0f); /* lower left */ - SET_ATTRIB(1, 0, clip_x1, clip_y0, z, 1.0f); /* lower right */ - SET_ATTRIB(2, 0, clip_x1, clip_y1, z, 1.0f); /* upper right */ - SET_ATTRIB(3, 0, clip_x0, clip_y1, z, 1.0f); /* upper left */ - - semantic_names[0] = TGSI_SEMANTIC_POSITION; - semantic_indexes[0] = 0; - } - - /* colors */ - if (emitColor) { - const GLfloat *c = ctx->Current.Attrib[VERT_ATTRIB_COLOR0]; - SET_ATTRIB(0, 1, c[0], c[1], c[2], c[3]); - SET_ATTRIB(1, 1, c[0], c[1], c[2], c[3]); - SET_ATTRIB(2, 1, c[0], c[1], c[2], c[3]); - SET_ATTRIB(3, 1, c[0], c[1], c[2], c[3]); - semantic_names[1] = TGSI_SEMANTIC_COLOR; - semantic_indexes[1] = 0; - attr = 2; - } - else { - attr = 1; - } - - /* texcoords */ - for (i = 0; i < ctx->Const.MaxTextureUnits; i++) { - if (ctx->Texture.Unit[i]._ReallyEnabled & TEXTURE_2D_BIT) { - struct gl_texture_object *obj = ctx->Texture.Unit[i]._Current; - struct gl_texture_image *img = obj->Image[0][obj->BaseLevel]; - const GLfloat wt = (GLfloat) img->Width; - const GLfloat ht = (GLfloat) img->Height; - const GLfloat s0 = obj->CropRect[0] / wt; - const GLfloat t0 = obj->CropRect[1] / ht; - const GLfloat s1 = (obj->CropRect[0] + obj->CropRect[2]) / wt; - const GLfloat t1 = (obj->CropRect[1] + obj->CropRect[3]) / ht; - - /*printf("crop texcoords: %g, %g .. %g, %g\n", s0, t0, s1, t1);*/ - SET_ATTRIB(0, attr, s0, t0, 0.0f, 1.0f); /* lower left */ - SET_ATTRIB(1, attr, s1, t0, 0.0f, 1.0f); /* lower right */ - SET_ATTRIB(2, attr, s1, t1, 0.0f, 1.0f); /* upper right */ - SET_ATTRIB(3, attr, s0, t1, 0.0f, 1.0f); /* upper left */ - - semantic_names[attr] = TGSI_SEMANTIC_GENERIC; - semantic_indexes[attr] = 0; - - attr++; - } - } - - pipe_buffer_unmap(pipe, vbuffer_transfer); - -#undef SET_ATTRIB - } - - - cso_save_viewport(cso); - cso_save_vertex_shader(cso); - cso_save_vertex_elements(cso); - - { - void *vs = lookup_shader(pipe, numAttribs, - semantic_names, semantic_indexes); - cso_set_vertex_shader_handle(cso, vs); - } - - for (i = 0; i < numAttribs; i++) { - velements[i].src_offset = i * 4 * sizeof(float); - velements[i].instance_divisor = 0; - velements[i].vertex_buffer_index = 0; - velements[i].src_format = PIPE_FORMAT_R32G32B32A32_FLOAT; - } - cso_set_vertex_elements(cso, numAttribs, velements); - - /* viewport state: viewport matching window dims */ - { - const struct gl_framebuffer *fb = st->ctx->DrawBuffer; - const GLboolean invert = (st_fb_orientation(fb) == Y_0_TOP); - const GLfloat width = (GLfloat)fb->Width; - const GLfloat height = (GLfloat)fb->Height; - struct pipe_viewport_state vp; - vp.scale[0] = 0.5f * width; - vp.scale[1] = height * (invert ? 
-0.5f : 0.5f); - vp.scale[2] = 1.0f; - vp.scale[3] = 1.0f; - vp.translate[0] = 0.5f * width; - vp.translate[1] = 0.5f * height; - vp.translate[2] = 0.0f; - vp.translate[3] = 0.0f; - cso_set_viewport(cso, &vp); - } - - - util_draw_vertex_buffer(pipe, vbuffer, - 0, /* offset */ - PIPE_PRIM_TRIANGLE_FAN, - 4, /* verts */ - numAttribs); /* attribs/vert */ - - - pipe_resource_reference(&vbuffer, NULL); - - /* restore state */ - cso_restore_viewport(cso); - cso_restore_vertex_shader(cso); - cso_restore_vertex_elements(cso); -} - - -void -st_init_drawtex_functions(struct dd_function_table *functions) -{ - functions->DrawTex = st_DrawTex; -} - - -/** - * Free any cached shaders - */ -void -st_destroy_drawtex(struct st_context *st) -{ - GLuint i; - for (i = 0; i < NumCachedShaders; i++) { - cso_delete_vertex_shader(st->cso_context, CachedShaders[i].handle); - } - NumCachedShaders = 0; -} - - -#endif /* FEATURE_OES_draw_texture */ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + **************************************************************************/ + + +/** + * Implementation of glDrawTex() for GL_OES_draw_tex + */ + + + +#include "main/imports.h" +#include "main/image.h" +#include "main/macros.h" +#include "main/mfeatures.h" +#include "program/program.h" +#include "program/prog_print.h" + +#include "st_context.h" +#include "st_atom.h" +#include "st_cb_drawtex.h" + +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "util/u_inlines.h" +#include "pipe/p_shader_tokens.h" +#include "util/u_draw_quad.h" +#include "util/u_simple_shaders.h" + +#include "cso_cache/cso_context.h" + + +#if FEATURE_OES_draw_texture + + +struct cached_shader +{ + void *handle; + + uint num_attribs; + uint semantic_names[2 + MAX_TEXTURE_UNITS]; + uint semantic_indexes[2 + MAX_TEXTURE_UNITS]; +}; + +#define MAX_SHADERS (2 * MAX_TEXTURE_UNITS) + +/** + * Simple linear list cache. + * Most of the time there'll only be one cached shader. 
+ */ +static struct cached_shader CachedShaders[MAX_SHADERS]; +static GLuint NumCachedShaders = 0; + + +static void * +lookup_shader(struct pipe_context *pipe, + uint num_attribs, + const uint *semantic_names, + const uint *semantic_indexes) +{ + GLuint i, j; + + /* look for existing shader with same attributes */ + for (i = 0; i < NumCachedShaders; i++) { + if (CachedShaders[i].num_attribs == num_attribs) { + GLboolean match = GL_TRUE; + for (j = 0; j < num_attribs; j++) { + if (semantic_names[j] != CachedShaders[i].semantic_names[j] || + semantic_indexes[j] != CachedShaders[i].semantic_indexes[j]) { + match = GL_FALSE; + break; + } + } + if (match) + return CachedShaders[i].handle; + } + } + + /* not found - create new one now */ + if (NumCachedShaders >= MAX_SHADERS) { + return NULL; + } + + CachedShaders[i].num_attribs = num_attribs; + for (j = 0; j < num_attribs; j++) { + CachedShaders[i].semantic_names[j] = semantic_names[j]; + CachedShaders[i].semantic_indexes[j] = semantic_indexes[j]; + } + + CachedShaders[i].handle = + util_make_vertex_passthrough_shader(pipe, + num_attribs, + semantic_names, + semantic_indexes); + NumCachedShaders++; + + return CachedShaders[i].handle; +} + +static void +st_DrawTex(struct gl_context *ctx, GLfloat x, GLfloat y, GLfloat z, + GLfloat width, GLfloat height) +{ + struct st_context *st = ctx->st; + struct pipe_context *pipe = st->pipe; + struct cso_context *cso = ctx->st->cso_context; + struct pipe_resource *vbuffer; + struct pipe_transfer *vbuffer_transfer; + GLuint i, numTexCoords, numAttribs; + GLboolean emitColor; + uint semantic_names[2 + MAX_TEXTURE_UNITS]; + uint semantic_indexes[2 + MAX_TEXTURE_UNITS]; + struct pipe_vertex_element velements[2 + MAX_TEXTURE_UNITS]; + GLbitfield inputs = VERT_BIT_POS; + + st_validate_state(st); + + /* determine if we need vertex color */ + if (ctx->FragmentProgram._Current->Base.InputsRead & FRAG_BIT_COL0) + emitColor = GL_TRUE; + else + emitColor = GL_FALSE; + + /* determine how many enabled sets of texcoords */ + numTexCoords = 0; + for (i = 0; i < ctx->Const.MaxTextureUnits; i++) { + if (ctx->Texture.Unit[i]._ReallyEnabled & TEXTURE_2D_BIT) { + inputs |= VERT_BIT_TEX(i); + numTexCoords++; + } + } + + /* total number of attributes per vertex */ + numAttribs = 1 + emitColor + numTexCoords; + + + /* create the vertex buffer */ + vbuffer = pipe_buffer_create(pipe->screen, PIPE_BIND_VERTEX_BUFFER, + PIPE_USAGE_STREAM, + numAttribs * 4 * 4 * sizeof(GLfloat)); + + /* load vertex buffer */ + { +#define SET_ATTRIB(VERT, ATTR, X, Y, Z, W) \ + do { \ + GLuint k = (((VERT) * numAttribs + (ATTR)) * 4); \ + assert(k < 4 * 4 * numAttribs); \ + vbuf[k + 0] = X; \ + vbuf[k + 1] = Y; \ + vbuf[k + 2] = Z; \ + vbuf[k + 3] = W; \ + } while (0) + + const GLfloat x0 = x, y0 = y, x1 = x + width, y1 = y + height; + GLfloat *vbuf = (GLfloat *) pipe_buffer_map(pipe, vbuffer, + PIPE_TRANSFER_WRITE, + &vbuffer_transfer); + GLuint attr; + + z = CLAMP(z, 0.0f, 1.0f); + + /* positions (in clip coords) */ + { + const struct gl_framebuffer *fb = st->ctx->DrawBuffer; + const GLfloat fb_width = (GLfloat)fb->Width; + const GLfloat fb_height = (GLfloat)fb->Height; + + const GLfloat clip_x0 = (GLfloat)(x0 / fb_width * 2.0 - 1.0); + const GLfloat clip_y0 = (GLfloat)(y0 / fb_height * 2.0 - 1.0); + const GLfloat clip_x1 = (GLfloat)(x1 / fb_width * 2.0 - 1.0); + const GLfloat clip_y1 = (GLfloat)(y1 / fb_height * 2.0 - 1.0); + + SET_ATTRIB(0, 0, clip_x0, clip_y0, z, 1.0f); /* lower left */ + SET_ATTRIB(1, 0, clip_x1, clip_y0, z, 1.0f); /* lower right */ 
+ SET_ATTRIB(2, 0, clip_x1, clip_y1, z, 1.0f); /* upper right */ + SET_ATTRIB(3, 0, clip_x0, clip_y1, z, 1.0f); /* upper left */ + + semantic_names[0] = TGSI_SEMANTIC_POSITION; + semantic_indexes[0] = 0; + } + + /* colors */ + if (emitColor) { + const GLfloat *c = ctx->Current.Attrib[VERT_ATTRIB_COLOR0]; + SET_ATTRIB(0, 1, c[0], c[1], c[2], c[3]); + SET_ATTRIB(1, 1, c[0], c[1], c[2], c[3]); + SET_ATTRIB(2, 1, c[0], c[1], c[2], c[3]); + SET_ATTRIB(3, 1, c[0], c[1], c[2], c[3]); + semantic_names[1] = TGSI_SEMANTIC_COLOR; + semantic_indexes[1] = 0; + attr = 2; + } + else { + attr = 1; + } + + /* texcoords */ + for (i = 0; i < ctx->Const.MaxTextureUnits; i++) { + if (ctx->Texture.Unit[i]._ReallyEnabled & TEXTURE_2D_BIT) { + struct gl_texture_object *obj = ctx->Texture.Unit[i]._Current; + struct gl_texture_image *img = obj->Image[0][obj->BaseLevel]; + const GLfloat wt = (GLfloat) img->Width; + const GLfloat ht = (GLfloat) img->Height; + const GLfloat s0 = obj->CropRect[0] / wt; + const GLfloat t0 = obj->CropRect[1] / ht; + const GLfloat s1 = (obj->CropRect[0] + obj->CropRect[2]) / wt; + const GLfloat t1 = (obj->CropRect[1] + obj->CropRect[3]) / ht; + + /*printf("crop texcoords: %g, %g .. %g, %g\n", s0, t0, s1, t1);*/ + SET_ATTRIB(0, attr, s0, t0, 0.0f, 1.0f); /* lower left */ + SET_ATTRIB(1, attr, s1, t0, 0.0f, 1.0f); /* lower right */ + SET_ATTRIB(2, attr, s1, t1, 0.0f, 1.0f); /* upper right */ + SET_ATTRIB(3, attr, s0, t1, 0.0f, 1.0f); /* upper left */ + + semantic_names[attr] = TGSI_SEMANTIC_GENERIC; + semantic_indexes[attr] = 0; + + attr++; + } + } + + pipe_buffer_unmap(pipe, vbuffer_transfer); + +#undef SET_ATTRIB + } + + + cso_save_viewport(cso); + cso_save_vertex_shader(cso); + cso_save_vertex_elements(cso); + cso_save_vertex_buffers(cso); + + { + void *vs = lookup_shader(pipe, numAttribs, + semantic_names, semantic_indexes); + cso_set_vertex_shader_handle(cso, vs); + } + + for (i = 0; i < numAttribs; i++) { + velements[i].src_offset = i * 4 * sizeof(float); + velements[i].instance_divisor = 0; + velements[i].vertex_buffer_index = 0; + velements[i].src_format = PIPE_FORMAT_R32G32B32A32_FLOAT; + } + cso_set_vertex_elements(cso, numAttribs, velements); + + /* viewport state: viewport matching window dims */ + { + const struct gl_framebuffer *fb = st->ctx->DrawBuffer; + const GLboolean invert = (st_fb_orientation(fb) == Y_0_TOP); + const GLfloat width = (GLfloat)fb->Width; + const GLfloat height = (GLfloat)fb->Height; + struct pipe_viewport_state vp; + vp.scale[0] = 0.5f * width; + vp.scale[1] = height * (invert ? 
-0.5f : 0.5f); + vp.scale[2] = 1.0f; + vp.scale[3] = 1.0f; + vp.translate[0] = 0.5f * width; + vp.translate[1] = 0.5f * height; + vp.translate[2] = 0.0f; + vp.translate[3] = 0.0f; + cso_set_viewport(cso, &vp); + } + + + util_draw_vertex_buffer(pipe, cso, vbuffer, + 0, /* offset */ + PIPE_PRIM_TRIANGLE_FAN, + 4, /* verts */ + numAttribs); /* attribs/vert */ + + + pipe_resource_reference(&vbuffer, NULL); + + /* restore state */ + cso_restore_viewport(cso); + cso_restore_vertex_shader(cso); + cso_restore_vertex_elements(cso); + cso_restore_vertex_buffers(cso); +} + + +void +st_init_drawtex_functions(struct dd_function_table *functions) +{ + functions->DrawTex = st_DrawTex; +} + + +/** + * Free any cached shaders + */ +void +st_destroy_drawtex(struct st_context *st) +{ + GLuint i; + for (i = 0; i < NumCachedShaders; i++) { + cso_delete_vertex_shader(st->cso_context, CachedShaders[i].handle); + } + NumCachedShaders = 0; +} + + +#endif /* FEATURE_OES_draw_texture */ diff --git a/mesalib/src/mesa/state_tracker/st_context.c b/mesalib/src/mesa/state_tracker/st_context.c index dccbff3c1..7a19f35bb 100644 --- a/mesalib/src/mesa/state_tracker/st_context.c +++ b/mesalib/src/mesa/state_tracker/st_context.c @@ -203,6 +203,11 @@ static void st_destroy_context_priv( struct st_context *st ) st_destroy_drawpix(st); st_destroy_drawtex(st); + /* Unreference any user vertex buffers. */ + for (i = 0; i < st->num_user_vbs; i++) { + pipe_resource_reference(&st->user_vb[i], NULL); + } + for (i = 0; i < Elements(st->state.sampler_views); i++) { pipe_sampler_view_reference(&st->state.sampler_views[i], NULL); } diff --git a/mesalib/src/mesa/state_tracker/st_context.h b/mesalib/src/mesa/state_tracker/st_context.h index 492ee600e..77765f023 100644 --- a/mesalib/src/mesa/state_tracker/st_context.h +++ b/mesalib/src/mesa/state_tracker/st_context.h @@ -1,265 +1,270 @@ -/************************************************************************** - * - * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. - * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR - * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
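
The st_context.c hunk above is the teardown half of the user-vertex-buffer tracking this patch introduces: any pipe_resource wrappers still held in st->user_vb[] are unreferenced when the context is destroyed. A minimal sketch of the same idiom, assuming only the user_vb[]/num_user_vbs fields added by this patch; the helper name is illustrative and not part of the change:

static void
release_user_vertex_buffers(struct st_context *st)
{
   unsigned i;

   /* Illustrative only: pipe_resource_reference(&p, NULL) drops one
    * reference and clears the pointer; a wrapper created by
    * pipe_user_buffer_create() is freed once its count reaches zero. */
   for (i = 0; i < st->num_user_vbs; i++)
      pipe_resource_reference(&st->user_vb[i], NULL);

   st->num_user_vbs = 0;
}

st_validate_varrays() in st_draw.c below performs the same unreference before rebuilding the vertex-buffer bindings, so stale wrappers never outlive the client arrays they point into.
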
- * - **************************************************************************/ - -#ifndef ST_CONTEXT_H -#define ST_CONTEXT_H - -#include "main/mtypes.h" -#include "pipe/p_state.h" -#include "state_tracker/st_api.h" - -struct bitmap_cache; -struct blit_state; -struct dd_function_table; -struct draw_context; -struct draw_stage; -struct gen_mipmap_state; -struct st_context; -struct st_fragment_program; - - -#define ST_NEW_MESA 0x1 /* Mesa state has changed */ -#define ST_NEW_FRAGMENT_PROGRAM 0x2 -#define ST_NEW_VERTEX_PROGRAM 0x4 -#define ST_NEW_FRAMEBUFFER 0x8 -#define ST_NEW_EDGEFLAGS_DATA 0x10 -#define ST_NEW_GEOMETRY_PROGRAM 0x20 - - -struct st_state_flags { - GLuint mesa; - GLuint st; -}; - -struct st_tracked_state { - const char *name; - struct st_state_flags dirty; - void (*update)( struct st_context *st ); -}; - - - -struct st_context -{ - struct st_context_iface iface; - - struct gl_context *ctx; - - struct pipe_context *pipe; - - struct draw_context *draw; /**< For selection/feedback/rastpos only */ - struct draw_stage *feedback_stage; /**< For GL_FEEDBACK rendermode */ - struct draw_stage *selection_stage; /**< For GL_SELECT rendermode */ - struct draw_stage *rastpos_stage; /**< For glRasterPos */ - - - /* On old libGL's for linux we need to invalidate the drawables - * on glViewpport calls, this is set via a option. - */ - boolean invalidate_on_gl_viewport; - - /* Some state is contained in constant objects. - * Other state is just parameter values. - */ - struct { - struct pipe_blend_state blend; - struct pipe_depth_stencil_alpha_state depth_stencil; - struct pipe_rasterizer_state rasterizer; - struct pipe_sampler_state samplers[PIPE_MAX_SAMPLERS]; - struct pipe_sampler_state *sampler_list[PIPE_MAX_SAMPLERS]; - struct pipe_clip_state clip; - struct { - void *ptr; - unsigned size; - } constants[PIPE_SHADER_TYPES]; - struct pipe_framebuffer_state framebuffer; - struct pipe_sampler_view *sampler_views[PIPE_MAX_SAMPLERS]; - struct pipe_scissor_state scissor; - struct pipe_viewport_state viewport; - unsigned sample_mask; - - GLuint num_samplers; - GLuint num_textures; - - GLuint poly_stipple[32]; /**< In OpenGL's bottom-to-top order */ - } state; - - char vendor[100]; - char renderer[100]; - - struct st_state_flags dirty; - - GLboolean missing_textures; - GLboolean vertdata_edgeflags; - - /** Mapping from VERT_RESULT_x to post-transformed vertex slot */ - const GLuint *vertex_result_to_slot; - - struct st_vertex_program *vp; /**< Currently bound vertex program */ - struct st_fragment_program *fp; /**< Currently bound fragment program */ - struct st_geometry_program *gp; /**< Currently bound geometry program */ - - struct st_vp_variant *vp_variant; - struct st_fp_variant *fp_variant; - struct st_gp_variant *gp_variant; - - struct gl_texture_object *default_texture; - - struct { - struct gl_program_cache *cache; - struct st_fragment_program *program; /**< cur pixel transfer prog */ - GLuint xfer_prog_sn; /**< pixel xfer program serial no. */ - GLuint user_prog_sn; /**< user fragment program serial no. */ - struct st_fragment_program *combined_prog; - GLuint combined_prog_sn; - struct pipe_resource *pixelmap_texture; - struct pipe_sampler_view *pixelmap_sampler_view; - boolean pixelmap_enabled; /**< use the pixelmap texture? 
*/ - } pixel_xfer; - - /** for glBitmap */ - struct { - struct pipe_rasterizer_state rasterizer; - struct pipe_sampler_state samplers[2]; - enum pipe_format tex_format; - void *vs; - float vertices[4][3][4]; /**< vertex pos + color + texcoord */ - struct pipe_resource *vbuf; - unsigned vbuf_slot; /* next free slot in vbuf */ - struct bitmap_cache *cache; - } bitmap; - - /** for glDraw/CopyPixels */ - struct { - struct gl_fragment_program *shaders[4]; - void *vert_shaders[2]; /**< ureg shaders */ - } drawpix; - - /** for glClear */ - struct { - struct pipe_rasterizer_state raster; - struct pipe_viewport_state viewport; - struct pipe_clip_state clip; - void *vs; - void *fs; - float vertices[4][2][4]; /**< vertex pos + color */ - struct pipe_resource *vbuf; - unsigned vbuf_slot; - boolean enable_ds_separate; - } clear; - - /** used for anything using util_draw_vertex_buffer */ - struct pipe_vertex_element velems_util_draw[3]; - - void *passthrough_fs; /**< simple pass-through frag shader */ - - enum pipe_texture_target internal_target; - struct gen_mipmap_state *gen_mipmap; - struct blit_state *blit; - - struct cso_context *cso_context; - - int force_msaa; - void *winsys_drawable_handle; -}; - - -/* Need this so that we can implement Mesa callbacks in this module. - */ -static INLINE struct st_context *st_context(struct gl_context *ctx) -{ - return ctx->st; -} - - -/** - * Wrapper for struct gl_framebuffer. - * This is an opaque type to the outside world. - */ -struct st_framebuffer -{ - struct gl_framebuffer Base; - void *Private; - - struct st_framebuffer_iface *iface; - enum st_attachment_type statts[ST_ATTACHMENT_COUNT]; - unsigned num_statts; - int32_t revalidate; -}; - - -extern void st_init_driver_functions(struct dd_function_table *functions); - -void st_invalidate_state(struct gl_context * ctx, GLuint new_state); - - - -#define Y_0_TOP 1 -#define Y_0_BOTTOM 2 - -static INLINE GLuint -st_fb_orientation(const struct gl_framebuffer *fb) -{ - if (fb && fb->Name == 0) { - /* Drawing into a window (on-screen buffer). - * - * Negate Y scale to flip image vertically. - * The NDC Y coords prior to viewport transformation are in the range - * [y=-1=bottom, y=1=top] - * Hardware window coords are in the range [y=0=top, y=H-1=bottom] where - * H is the window height. - * Use the viewport transformation to invert Y. - */ - return Y_0_TOP; - } - else { - /* Drawing into user-created FBO (very likely a texture). - * - * For textures, T=0=Bottom, so by extension Y=0=Bottom for rendering. - */ - return Y_0_BOTTOM; - } -} - - -/** clear-alloc a struct-sized object, with casting */ -#define ST_CALLOC_STRUCT(T) (struct T *) calloc(1, sizeof(struct T)) - - -extern int -st_get_msaa(void); - -extern struct st_context * -st_create_context(gl_api api, struct pipe_context *pipe, - const struct gl_config *visual, - struct st_context *share); - -extern void -st_destroy_context(struct st_context *st); - - -#endif +/************************************************************************** + * + * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef ST_CONTEXT_H +#define ST_CONTEXT_H + +#include "main/mtypes.h" +#include "pipe/p_state.h" +#include "state_tracker/st_api.h" + +struct bitmap_cache; +struct blit_state; +struct dd_function_table; +struct draw_context; +struct draw_stage; +struct gen_mipmap_state; +struct st_context; +struct st_fragment_program; + + +#define ST_NEW_MESA 0x1 /* Mesa state has changed */ +#define ST_NEW_FRAGMENT_PROGRAM 0x2 +#define ST_NEW_VERTEX_PROGRAM 0x4 +#define ST_NEW_FRAMEBUFFER 0x8 +#define ST_NEW_EDGEFLAGS_DATA 0x10 +#define ST_NEW_GEOMETRY_PROGRAM 0x20 + + +struct st_state_flags { + GLuint mesa; + GLuint st; +}; + +struct st_tracked_state { + const char *name; + struct st_state_flags dirty; + void (*update)( struct st_context *st ); +}; + + + +struct st_context +{ + struct st_context_iface iface; + + struct gl_context *ctx; + + struct pipe_context *pipe; + + struct draw_context *draw; /**< For selection/feedback/rastpos only */ + struct draw_stage *feedback_stage; /**< For GL_FEEDBACK rendermode */ + struct draw_stage *selection_stage; /**< For GL_SELECT rendermode */ + struct draw_stage *rastpos_stage; /**< For glRasterPos */ + + + /* On old libGL's for linux we need to invalidate the drawables + * on glViewpport calls, this is set via a option. + */ + boolean invalidate_on_gl_viewport; + + /* Some state is contained in constant objects. + * Other state is just parameter values. 
+ */ + struct { + struct pipe_blend_state blend; + struct pipe_depth_stencil_alpha_state depth_stencil; + struct pipe_rasterizer_state rasterizer; + struct pipe_sampler_state samplers[PIPE_MAX_SAMPLERS]; + struct pipe_sampler_state *sampler_list[PIPE_MAX_SAMPLERS]; + struct pipe_clip_state clip; + struct { + void *ptr; + unsigned size; + } constants[PIPE_SHADER_TYPES]; + struct pipe_framebuffer_state framebuffer; + struct pipe_sampler_view *sampler_views[PIPE_MAX_SAMPLERS]; + struct pipe_scissor_state scissor; + struct pipe_viewport_state viewport; + unsigned sample_mask; + + GLuint num_samplers; + GLuint num_textures; + + GLuint poly_stipple[32]; /**< In OpenGL's bottom-to-top order */ + } state; + + char vendor[100]; + char renderer[100]; + + struct st_state_flags dirty; + + GLboolean missing_textures; + GLboolean vertdata_edgeflags; + + /** Mapping from VERT_RESULT_x to post-transformed vertex slot */ + const GLuint *vertex_result_to_slot; + + struct st_vertex_program *vp; /**< Currently bound vertex program */ + struct st_fragment_program *fp; /**< Currently bound fragment program */ + struct st_geometry_program *gp; /**< Currently bound geometry program */ + + struct st_vp_variant *vp_variant; + struct st_fp_variant *fp_variant; + struct st_gp_variant *gp_variant; + + struct gl_texture_object *default_texture; + + struct { + struct gl_program_cache *cache; + struct st_fragment_program *program; /**< cur pixel transfer prog */ + GLuint xfer_prog_sn; /**< pixel xfer program serial no. */ + GLuint user_prog_sn; /**< user fragment program serial no. */ + struct st_fragment_program *combined_prog; + GLuint combined_prog_sn; + struct pipe_resource *pixelmap_texture; + struct pipe_sampler_view *pixelmap_sampler_view; + boolean pixelmap_enabled; /**< use the pixelmap texture? */ + } pixel_xfer; + + /** for glBitmap */ + struct { + struct pipe_rasterizer_state rasterizer; + struct pipe_sampler_state samplers[2]; + enum pipe_format tex_format; + void *vs; + float vertices[4][3][4]; /**< vertex pos + color + texcoord */ + struct pipe_resource *vbuf; + unsigned vbuf_slot; /* next free slot in vbuf */ + struct bitmap_cache *cache; + } bitmap; + + /** for glDraw/CopyPixels */ + struct { + struct gl_fragment_program *shaders[4]; + void *vert_shaders[2]; /**< ureg shaders */ + } drawpix; + + /** for glClear */ + struct { + struct pipe_rasterizer_state raster; + struct pipe_viewport_state viewport; + struct pipe_clip_state clip; + void *vs; + void *fs; + float vertices[4][2][4]; /**< vertex pos + color */ + struct pipe_resource *vbuf; + unsigned vbuf_slot; + boolean enable_ds_separate; + } clear; + + /** used for anything using util_draw_vertex_buffer */ + struct pipe_vertex_element velems_util_draw[3]; + + void *passthrough_fs; /**< simple pass-through frag shader */ + + enum pipe_texture_target internal_target; + struct gen_mipmap_state *gen_mipmap; + struct blit_state *blit; + + struct cso_context *cso_context; + + int force_msaa; + void *winsys_drawable_handle; + + /* User vertex buffers. */ + struct pipe_resource *user_vb[PIPE_MAX_ATTRIBS]; + unsigned user_vb_stride[PIPE_MAX_ATTRIBS]; + unsigned num_user_vbs; +}; + + +/* Need this so that we can implement Mesa callbacks in this module. + */ +static INLINE struct st_context *st_context(struct gl_context *ctx) +{ + return ctx->st; +} + + +/** + * Wrapper for struct gl_framebuffer. + * This is an opaque type to the outside world. 
+ */ +struct st_framebuffer +{ + struct gl_framebuffer Base; + void *Private; + + struct st_framebuffer_iface *iface; + enum st_attachment_type statts[ST_ATTACHMENT_COUNT]; + unsigned num_statts; + int32_t revalidate; +}; + + +extern void st_init_driver_functions(struct dd_function_table *functions); + +void st_invalidate_state(struct gl_context * ctx, GLuint new_state); + + + +#define Y_0_TOP 1 +#define Y_0_BOTTOM 2 + +static INLINE GLuint +st_fb_orientation(const struct gl_framebuffer *fb) +{ + if (fb && fb->Name == 0) { + /* Drawing into a window (on-screen buffer). + * + * Negate Y scale to flip image vertically. + * The NDC Y coords prior to viewport transformation are in the range + * [y=-1=bottom, y=1=top] + * Hardware window coords are in the range [y=0=top, y=H-1=bottom] where + * H is the window height. + * Use the viewport transformation to invert Y. + */ + return Y_0_TOP; + } + else { + /* Drawing into user-created FBO (very likely a texture). + * + * For textures, T=0=Bottom, so by extension Y=0=Bottom for rendering. + */ + return Y_0_BOTTOM; + } +} + + +/** clear-alloc a struct-sized object, with casting */ +#define ST_CALLOC_STRUCT(T) (struct T *) calloc(1, sizeof(struct T)) + + +extern int +st_get_msaa(void); + +extern struct st_context * +st_create_context(gl_api api, struct pipe_context *pipe, + const struct gl_config *visual, + struct st_context *share); + +extern void +st_destroy_context(struct st_context *st); + + +#endif diff --git a/mesalib/src/mesa/state_tracker/st_draw.c b/mesalib/src/mesa/state_tracker/st_draw.c index 19466ea44..830e3e3c1 100644 --- a/mesalib/src/mesa/state_tracker/st_draw.c +++ b/mesalib/src/mesa/state_tracker/st_draw.c @@ -1,788 +1,749 @@ -/************************************************************************** - * - * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. - * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR - * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - **************************************************************************/ - -/* - * This file implements the st_draw_vbo() function which is called from - * Mesa's VBO module. All point/line/triangle rendering is done through - * this function whether the user called glBegin/End, glDrawArrays, - * glDrawElements, glEvalMesh, or glCalList, etc. 
- * - * We basically convert the VBO's vertex attribute/array information into - * Gallium vertex state, bind the vertex buffer objects and call - * pipe->draw_elements(), pipe->draw_range_elements() or pipe->draw_arrays(). - * - * Authors: - * Keith Whitwell - */ - - -#include "main/imports.h" -#include "main/image.h" -#include "main/macros.h" -#include "main/mfeatures.h" -#include "program/prog_uniform.h" - -#include "vbo/vbo.h" - -#include "st_context.h" -#include "st_atom.h" -#include "st_cb_bufferobjects.h" -#include "st_draw.h" -#include "st_program.h" - -#include "pipe/p_context.h" -#include "pipe/p_defines.h" -#include "util/u_inlines.h" -#include "util/u_format.h" -#include "util/u_prim.h" -#include "util/u_draw_quad.h" -#include "draw/draw_context.h" -#include "cso_cache/cso_context.h" - - -static GLuint double_types[4] = { - PIPE_FORMAT_R64_FLOAT, - PIPE_FORMAT_R64G64_FLOAT, - PIPE_FORMAT_R64G64B64_FLOAT, - PIPE_FORMAT_R64G64B64A64_FLOAT -}; - -static GLuint float_types[4] = { - PIPE_FORMAT_R32_FLOAT, - PIPE_FORMAT_R32G32_FLOAT, - PIPE_FORMAT_R32G32B32_FLOAT, - PIPE_FORMAT_R32G32B32A32_FLOAT -}; - -static GLuint half_float_types[4] = { - PIPE_FORMAT_R16_FLOAT, - PIPE_FORMAT_R16G16_FLOAT, - PIPE_FORMAT_R16G16B16_FLOAT, - PIPE_FORMAT_R16G16B16A16_FLOAT -}; - -static GLuint uint_types_norm[4] = { - PIPE_FORMAT_R32_UNORM, - PIPE_FORMAT_R32G32_UNORM, - PIPE_FORMAT_R32G32B32_UNORM, - PIPE_FORMAT_R32G32B32A32_UNORM -}; - -static GLuint uint_types_scale[4] = { - PIPE_FORMAT_R32_USCALED, - PIPE_FORMAT_R32G32_USCALED, - PIPE_FORMAT_R32G32B32_USCALED, - PIPE_FORMAT_R32G32B32A32_USCALED -}; - -static GLuint int_types_norm[4] = { - PIPE_FORMAT_R32_SNORM, - PIPE_FORMAT_R32G32_SNORM, - PIPE_FORMAT_R32G32B32_SNORM, - PIPE_FORMAT_R32G32B32A32_SNORM -}; - -static GLuint int_types_scale[4] = { - PIPE_FORMAT_R32_SSCALED, - PIPE_FORMAT_R32G32_SSCALED, - PIPE_FORMAT_R32G32B32_SSCALED, - PIPE_FORMAT_R32G32B32A32_SSCALED -}; - -static GLuint ushort_types_norm[4] = { - PIPE_FORMAT_R16_UNORM, - PIPE_FORMAT_R16G16_UNORM, - PIPE_FORMAT_R16G16B16_UNORM, - PIPE_FORMAT_R16G16B16A16_UNORM -}; - -static GLuint ushort_types_scale[4] = { - PIPE_FORMAT_R16_USCALED, - PIPE_FORMAT_R16G16_USCALED, - PIPE_FORMAT_R16G16B16_USCALED, - PIPE_FORMAT_R16G16B16A16_USCALED -}; - -static GLuint short_types_norm[4] = { - PIPE_FORMAT_R16_SNORM, - PIPE_FORMAT_R16G16_SNORM, - PIPE_FORMAT_R16G16B16_SNORM, - PIPE_FORMAT_R16G16B16A16_SNORM -}; - -static GLuint short_types_scale[4] = { - PIPE_FORMAT_R16_SSCALED, - PIPE_FORMAT_R16G16_SSCALED, - PIPE_FORMAT_R16G16B16_SSCALED, - PIPE_FORMAT_R16G16B16A16_SSCALED -}; - -static GLuint ubyte_types_norm[4] = { - PIPE_FORMAT_R8_UNORM, - PIPE_FORMAT_R8G8_UNORM, - PIPE_FORMAT_R8G8B8_UNORM, - PIPE_FORMAT_R8G8B8A8_UNORM -}; - -static GLuint ubyte_types_scale[4] = { - PIPE_FORMAT_R8_USCALED, - PIPE_FORMAT_R8G8_USCALED, - PIPE_FORMAT_R8G8B8_USCALED, - PIPE_FORMAT_R8G8B8A8_USCALED -}; - -static GLuint byte_types_norm[4] = { - PIPE_FORMAT_R8_SNORM, - PIPE_FORMAT_R8G8_SNORM, - PIPE_FORMAT_R8G8B8_SNORM, - PIPE_FORMAT_R8G8B8A8_SNORM -}; - -static GLuint byte_types_scale[4] = { - PIPE_FORMAT_R8_SSCALED, - PIPE_FORMAT_R8G8_SSCALED, - PIPE_FORMAT_R8G8B8_SSCALED, - PIPE_FORMAT_R8G8B8A8_SSCALED -}; - -static GLuint fixed_types[4] = { - PIPE_FORMAT_R32_FIXED, - PIPE_FORMAT_R32G32_FIXED, - PIPE_FORMAT_R32G32B32_FIXED, - PIPE_FORMAT_R32G32B32A32_FIXED -}; - - - -/** - * Return a PIPE_FORMAT_x for the given GL datatype and size. 
- */ -GLuint -st_pipe_vertex_format(GLenum type, GLuint size, GLenum format, - GLboolean normalized) -{ - assert((type >= GL_BYTE && type <= GL_DOUBLE) || - type == GL_FIXED || type == GL_HALF_FLOAT); - assert(size >= 1); - assert(size <= 4); - assert(format == GL_RGBA || format == GL_BGRA); - - if (format == GL_BGRA) { - /* this is an odd-ball case */ - assert(type == GL_UNSIGNED_BYTE); - assert(normalized); - return PIPE_FORMAT_B8G8R8A8_UNORM; - } - - if (normalized) { - switch (type) { - case GL_DOUBLE: return double_types[size-1]; - case GL_FLOAT: return float_types[size-1]; - case GL_HALF_FLOAT: return half_float_types[size-1]; - case GL_INT: return int_types_norm[size-1]; - case GL_SHORT: return short_types_norm[size-1]; - case GL_BYTE: return byte_types_norm[size-1]; - case GL_UNSIGNED_INT: return uint_types_norm[size-1]; - case GL_UNSIGNED_SHORT: return ushort_types_norm[size-1]; - case GL_UNSIGNED_BYTE: return ubyte_types_norm[size-1]; - case GL_FIXED: return fixed_types[size-1]; - default: assert(0); return 0; - } - } - else { - switch (type) { - case GL_DOUBLE: return double_types[size-1]; - case GL_FLOAT: return float_types[size-1]; - case GL_HALF_FLOAT: return half_float_types[size-1]; - case GL_INT: return int_types_scale[size-1]; - case GL_SHORT: return short_types_scale[size-1]; - case GL_BYTE: return byte_types_scale[size-1]; - case GL_UNSIGNED_INT: return uint_types_scale[size-1]; - case GL_UNSIGNED_SHORT: return ushort_types_scale[size-1]; - case GL_UNSIGNED_BYTE: return ubyte_types_scale[size-1]; - case GL_FIXED: return fixed_types[size-1]; - default: assert(0); return 0; - } - } - return 0; /* silence compiler warning */ -} - - - - - -/** - * Examine the active arrays to determine if we have interleaved - * vertex arrays all living in one VBO, or all living in user space. - * \param userSpace returns whether the arrays are in user space. - */ -static GLboolean -is_interleaved_arrays(const struct st_vertex_program *vp, - const struct st_vp_variant *vpv, - const struct gl_client_array **arrays, - GLboolean *userSpace) -{ - GLuint attr; - const struct gl_buffer_object *firstBufObj = NULL; - GLint firstStride = -1; - GLuint num_client_arrays = 0; - const GLubyte *client_addr = NULL; - - for (attr = 0; attr < vpv->num_inputs; attr++) { - const GLuint mesaAttr = vp->index_to_input[attr]; - const struct gl_buffer_object *bufObj = arrays[mesaAttr]->BufferObj; - const GLsizei stride = arrays[mesaAttr]->StrideB; /* in bytes */ - - if (firstStride < 0) { - firstStride = stride; - } - else if (firstStride != stride) { - return GL_FALSE; - } - - if (!bufObj || !bufObj->Name) { - num_client_arrays++; - /* Try to detect if the client-space arrays are - * "close" to each other. - */ - if (!client_addr) { - client_addr = arrays[mesaAttr]->Ptr; - } - else if (abs(arrays[mesaAttr]->Ptr - client_addr) > firstStride) { - /* arrays start too far apart */ - return GL_FALSE; - } - } - else if (!firstBufObj) { - firstBufObj = bufObj; - } - else if (bufObj != firstBufObj) { - return GL_FALSE; - } - } - - *userSpace = (num_client_arrays == vpv->num_inputs); - /* debug_printf("user space: %s (%d arrays, %d inputs)\n", - (int)*userSpace ? "Yes" : "No", num_client_arrays, vp->num_inputs); */ - - return GL_TRUE; -} - - -/** - * Compute the memory range occupied by the arrays. 
- */ -static void -get_arrays_bounds(const struct st_vertex_program *vp, - const struct st_vp_variant *vpv, - const struct gl_client_array **arrays, - GLuint max_index, - const GLubyte **low, const GLubyte **high) -{ - const GLubyte *low_addr = NULL; - const GLubyte *high_addr = NULL; - GLuint attr; - - /* debug_printf("get_arrays_bounds: Handling %u attrs\n", vpv->num_inputs); */ - - for (attr = 0; attr < vpv->num_inputs; attr++) { - const GLuint mesaAttr = vp->index_to_input[attr]; - const GLint stride = arrays[mesaAttr]->StrideB; - const GLubyte *start = arrays[mesaAttr]->Ptr; - const unsigned sz = (arrays[mesaAttr]->Size * - _mesa_sizeof_type(arrays[mesaAttr]->Type)); - const GLubyte *end = start + (max_index * stride) + sz; - - /* debug_printf("attr %u: stride %d size %u start %p end %p\n", - attr, stride, sz, start, end); */ - - if (attr == 0) { - low_addr = start; - high_addr = end; - } - else { - low_addr = MIN2(low_addr, start); - high_addr = MAX2(high_addr, end); - } - } - - *low = low_addr; - *high = high_addr; -} - - -/** - * Set up for drawing interleaved arrays that all live in one VBO - * or all live in user space. - * \param vbuffer returns vertex buffer info - * \param velements returns vertex element info - */ -static void -setup_interleaved_attribs(struct gl_context *ctx, - const struct st_vertex_program *vp, - const struct st_vp_variant *vpv, - const struct gl_client_array **arrays, - GLuint max_index, - GLboolean userSpace, - struct pipe_vertex_buffer *vbuffer, - struct pipe_vertex_element velements[]) -{ - struct st_context *st = st_context(ctx); - struct pipe_context *pipe = st->pipe; - GLuint attr; - const GLubyte *offset0 = NULL; - - for (attr = 0; attr < vpv->num_inputs; attr++) { - const GLuint mesaAttr = vp->index_to_input[attr]; - struct gl_buffer_object *bufobj = arrays[mesaAttr]->BufferObj; - struct st_buffer_object *stobj = st_buffer_object(bufobj); - GLsizei stride = arrays[mesaAttr]->StrideB; - - /*printf("stobj %u = %p\n", attr, (void*)stobj);*/ - - if (attr == 0) { - const GLubyte *low, *high; - - get_arrays_bounds(vp, vpv, arrays, max_index, &low, &high); - /* debug_printf("buffer range: %p %p range %d max index %u\n", - low, high, high - low, max_index); */ - - offset0 = low; - if (userSpace) { - vbuffer->buffer = - pipe_user_buffer_create(pipe->screen, (void *) low, high - low, - PIPE_BIND_VERTEX_BUFFER); - vbuffer->buffer_offset = 0; - } - else { - vbuffer->buffer = NULL; - pipe_resource_reference(&vbuffer->buffer, stobj->buffer); - vbuffer->buffer_offset = pointer_to_offset(low); - } - vbuffer->stride = stride; /* in bytes */ - vbuffer->max_index = max_index; - } - - /* - if (arrays[mesaAttr]->InstanceDivisor) - vbuffer[attr].max_index = arrays[mesaAttr]->_MaxElement; - else - vbuffer[attr].max_index = max_index; - */ - - velements[attr].src_offset = - (unsigned) (arrays[mesaAttr]->Ptr - offset0); - velements[attr].instance_divisor = arrays[mesaAttr]->InstanceDivisor; - velements[attr].vertex_buffer_index = 0; - velements[attr].src_format = - st_pipe_vertex_format(arrays[mesaAttr]->Type, - arrays[mesaAttr]->Size, - arrays[mesaAttr]->Format, - arrays[mesaAttr]->Normalized); - assert(velements[attr].src_format); - } -} - - -/** - * Set up a separate pipe_vertex_buffer and pipe_vertex_element for each - * vertex attribute. 
- * \param vbuffer returns vertex buffer info - * \param velements returns vertex element info - */ -static void -setup_non_interleaved_attribs(struct gl_context *ctx, - const struct st_vertex_program *vp, - const struct st_vp_variant *vpv, - const struct gl_client_array **arrays, - GLuint max_index, - GLboolean *userSpace, - struct pipe_vertex_buffer vbuffer[], - struct pipe_vertex_element velements[]) -{ - struct st_context *st = st_context(ctx); - struct pipe_context *pipe = st->pipe; - GLuint attr; - - for (attr = 0; attr < vpv->num_inputs; attr++) { - const GLuint mesaAttr = vp->index_to_input[attr]; - struct gl_buffer_object *bufobj = arrays[mesaAttr]->BufferObj; - GLsizei stride = arrays[mesaAttr]->StrideB; - - *userSpace = GL_FALSE; - - if (bufobj && bufobj->Name) { - /* Attribute data is in a VBO. - * Recall that for VBOs, the gl_client_array->Ptr field is - * really an offset from the start of the VBO, not a pointer. - */ - struct st_buffer_object *stobj = st_buffer_object(bufobj); - assert(stobj->buffer); - /*printf("stobj %u = %p\n", attr, (void*) stobj);*/ - - vbuffer[attr].buffer = NULL; - pipe_resource_reference(&vbuffer[attr].buffer, stobj->buffer); - vbuffer[attr].buffer_offset = pointer_to_offset(arrays[mesaAttr]->Ptr); - } - else { - /* attribute data is in user-space memory, not a VBO */ - uint bytes; - /*printf("user-space array %d stride %d\n", attr, stride);*/ - - *userSpace = GL_TRUE; - - /* wrap user data */ - if (arrays[mesaAttr]->Ptr) { - /* user's vertex array */ - if (arrays[mesaAttr]->StrideB) { - bytes = arrays[mesaAttr]->StrideB * (max_index + 1); - } - else { - bytes = arrays[mesaAttr]->Size - * _mesa_sizeof_type(arrays[mesaAttr]->Type); - } - vbuffer[attr].buffer = - pipe_user_buffer_create(pipe->screen, - (void *) arrays[mesaAttr]->Ptr, bytes, - PIPE_BIND_VERTEX_BUFFER); - } - else { - /* no array, use ctx->Current.Attrib[] value */ - bytes = sizeof(ctx->Current.Attrib[0]); - vbuffer[attr].buffer = - pipe_user_buffer_create(pipe->screen, - (void *) ctx->Current.Attrib[mesaAttr], - bytes, - PIPE_BIND_VERTEX_BUFFER); - stride = 0; - } - - vbuffer[attr].buffer_offset = 0; - } - - assert(velements[attr].src_offset <= 2048); /* 11-bit field */ - - /* common-case setup */ - vbuffer[attr].stride = stride; /* in bytes */ - if (arrays[mesaAttr]->InstanceDivisor) - vbuffer[attr].max_index = arrays[mesaAttr]->_MaxElement; - else - vbuffer[attr].max_index = max_index; - - velements[attr].src_offset = 0; - velements[attr].instance_divisor = arrays[mesaAttr]->InstanceDivisor; - velements[attr].vertex_buffer_index = attr; - velements[attr].src_format - = st_pipe_vertex_format(arrays[mesaAttr]->Type, - arrays[mesaAttr]->Size, - arrays[mesaAttr]->Format, - arrays[mesaAttr]->Normalized); - assert(velements[attr].src_format); - } -} - - -static void -setup_index_buffer(struct gl_context *ctx, - const struct _mesa_index_buffer *ib, - struct pipe_index_buffer *ibuffer) -{ - struct st_context *st = st_context(ctx); - struct pipe_context *pipe = st->pipe; - - memset(ibuffer, 0, sizeof(*ibuffer)); - if (ib) { - struct gl_buffer_object *bufobj = ib->obj; - - switch (ib->type) { - case GL_UNSIGNED_INT: - ibuffer->index_size = 4; - break; - case GL_UNSIGNED_SHORT: - ibuffer->index_size = 2; - break; - case GL_UNSIGNED_BYTE: - ibuffer->index_size = 1; - break; - default: - assert(0); - return; - } - - /* get/create the index buffer object */ - if (bufobj && bufobj->Name) { - /* elements/indexes are in a real VBO */ - struct st_buffer_object *stobj = st_buffer_object(bufobj); - 
pipe_resource_reference(&ibuffer->buffer, stobj->buffer); - ibuffer->offset = pointer_to_offset(ib->ptr); - } - else { - /* element/indicies are in user space memory */ - ibuffer->buffer = - pipe_user_buffer_create(pipe->screen, (void *) ib->ptr, - ib->count * ibuffer->index_size, - PIPE_BIND_INDEX_BUFFER); - } - } -} - -/** - * Prior to drawing, check that any uniforms referenced by the - * current shader have been set. If a uniform has not been set, - * issue a warning. - */ -static void -check_uniforms(struct gl_context *ctx) -{ - struct gl_shader_program *shProg[3] = { - ctx->Shader.CurrentVertexProgram, - ctx->Shader.CurrentGeometryProgram, - ctx->Shader.CurrentFragmentProgram, - }; - unsigned j; - - for (j = 0; j < 3; j++) { - unsigned i; - - if (shProg[j] == NULL || !shProg[j]->LinkStatus) - continue; - - for (i = 0; i < shProg[j]->Uniforms->NumUniforms; i++) { - const struct gl_uniform *u = &shProg[j]->Uniforms->Uniforms[i]; - if (!u->Initialized) { - _mesa_warning(ctx, - "Using shader with uninitialized uniform: %s", - u->Name); - } - } - } -} - - -/** - * Translate OpenGL primtive type (GL_POINTS, GL_TRIANGLE_STRIP, etc) to - * the corresponding Gallium type. - */ -static unsigned -translate_prim(const struct gl_context *ctx, unsigned prim) -{ - /* GL prims should match Gallium prims, spot-check a few */ - assert(GL_POINTS == PIPE_PRIM_POINTS); - assert(GL_QUADS == PIPE_PRIM_QUADS); - assert(GL_TRIANGLE_STRIP_ADJACENCY == PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY); - - /* Avoid quadstrips if it's easy to do so: - * Note: it's imporant to do the correct trimming if we change the prim type! - * We do that wherever this function is called. - */ - if (prim == GL_QUAD_STRIP && - ctx->Light.ShadeModel != GL_FLAT && - ctx->Polygon.FrontMode == GL_FILL && - ctx->Polygon.BackMode == GL_FILL) - prim = GL_TRIANGLE_STRIP; - - return prim; -} - - - -/** - * This function gets plugged into the VBO module and is called when - * we have something to render. - * Basically, translate the information into the format expected by gallium. - */ -void -st_draw_vbo(struct gl_context *ctx, - const struct gl_client_array **arrays, - const struct _mesa_prim *prims, - GLuint nr_prims, - const struct _mesa_index_buffer *ib, - GLboolean index_bounds_valid, - GLuint min_index, - GLuint max_index) -{ - struct st_context *st = st_context(ctx); - struct pipe_context *pipe = st->pipe; - const struct st_vertex_program *vp; - const struct st_vp_variant *vpv; - struct pipe_vertex_buffer vbuffer[PIPE_MAX_SHADER_INPUTS]; - GLuint attr; - struct pipe_vertex_element velements[PIPE_MAX_ATTRIBS]; - unsigned num_vbuffers, num_velements; - struct pipe_index_buffer ibuffer; - GLboolean userSpace = GL_FALSE; - GLboolean vertDataEdgeFlags; - struct pipe_draw_info info; - unsigned i; - - /* Mesa core state should have been validated already */ - assert(ctx->NewState == 0x0); - - /* Gallium probably doesn't want this in some cases. */ - if (!index_bounds_valid) - if (!vbo_all_varyings_in_vbos(arrays)) - vbo_get_minmax_index(ctx, prims, ib, &min_index, &max_index); - - /* sanity check for pointer arithmetic below */ - assert(sizeof(arrays[0]->Ptr[0]) == 1); - - vertDataEdgeFlags = arrays[VERT_ATTRIB_EDGEFLAG]->BufferObj && - arrays[VERT_ATTRIB_EDGEFLAG]->BufferObj->Name; - if (vertDataEdgeFlags != st->vertdata_edgeflags) { - st->vertdata_edgeflags = vertDataEdgeFlags; - st->dirty.st |= ST_NEW_EDGEFLAGS_DATA; - } - - st_validate_state(st); - - /* must get these after state validation! 
*/ - vp = st->vp; - vpv = st->vp_variant; - -#if 0 - if (MESA_VERBOSE & VERBOSE_GLSL) { - check_uniforms(ctx); - } -#else - (void) check_uniforms; -#endif - - memset(velements, 0, sizeof(struct pipe_vertex_element) * vpv->num_inputs); - /* - * Setup the vbuffer[] and velements[] arrays. - */ - if (is_interleaved_arrays(vp, vpv, arrays, &userSpace)) { - /*printf("Draw interleaved\n");*/ - setup_interleaved_attribs(ctx, vp, vpv, arrays, max_index, userSpace, - vbuffer, velements); - num_vbuffers = 1; - num_velements = vpv->num_inputs; - if (num_velements == 0) - num_vbuffers = 0; - } - else { - /*printf("Draw non-interleaved\n");*/ - setup_non_interleaved_attribs(ctx, vp, vpv, arrays, max_index, - &userSpace, vbuffer, velements); - num_vbuffers = vpv->num_inputs; - num_velements = vpv->num_inputs; - } - -#if 0 - { - GLuint i; - for (i = 0; i < num_vbuffers; i++) { - printf("buffers[%d].stride = %u\n", i, vbuffer[i].stride); - printf("buffers[%d].max_index = %u\n", i, vbuffer[i].max_index); - printf("buffers[%d].buffer_offset = %u\n", i, vbuffer[i].buffer_offset); - printf("buffers[%d].buffer = %p\n", i, (void*) vbuffer[i].buffer); - } - for (i = 0; i < num_velements; i++) { - printf("vlements[%d].vbuffer_index = %u\n", i, velements[i].vertex_buffer_index); - printf("vlements[%d].src_offset = %u\n", i, velements[i].src_offset); - printf("vlements[%d].format = %s\n", i, util_format_name(velements[i].src_format)); - } - } -#endif - - pipe->set_vertex_buffers(pipe, num_vbuffers, vbuffer); - cso_set_vertex_elements(st->cso_context, num_velements, velements); - - setup_index_buffer(ctx, ib, &ibuffer); - pipe->set_index_buffer(pipe, &ibuffer); - - util_draw_init_info(&info); - if (ib) { - info.indexed = TRUE; - if (min_index != ~0 && max_index != ~0) { - info.min_index = min_index; - info.max_index = max_index; - } - } - - info.primitive_restart = st->ctx->Array.PrimitiveRestart; - info.restart_index = st->ctx->Array.RestartIndex; - - /* do actual drawing */ - for (i = 0; i < nr_prims; i++) { - info.mode = translate_prim( ctx, prims[i].mode ); - info.start = prims[i].start; - info.count = prims[i].count; - info.instance_count = prims[i].num_instances; - info.index_bias = prims[i].basevertex; - if (!ib) { - info.min_index = info.start; - info.max_index = info.start + info.count - 1; - } - - if (u_trim_pipe_prim(info.mode, &info.count)) - pipe->draw_vbo(pipe, &info); - } - - pipe_resource_reference(&ibuffer.buffer, NULL); - - /* unreference buffers (frees wrapped user-space buffer objects) */ - for (attr = 0; attr < num_vbuffers; attr++) { - pipe_resource_reference(&vbuffer[attr].buffer, NULL); - assert(!vbuffer[attr].buffer); - } - - if (userSpace) - { - pipe->set_vertex_buffers(pipe, 0, NULL); - } -} - - -void st_init_draw( struct st_context *st ) -{ - struct gl_context *ctx = st->ctx; - - vbo_set_draw_func(ctx, st_draw_vbo); - -#if FEATURE_feedback || FEATURE_rastpos - st->draw = draw_create(st->pipe); /* for selection/feedback */ - - /* Disable draw options that might convert points/lines to tris, etc. - * as that would foul-up feedback/selection mode. 
- */ - draw_wide_line_threshold(st->draw, 1000.0f); - draw_wide_point_threshold(st->draw, 1000.0f); - draw_enable_line_stipple(st->draw, FALSE); - draw_enable_point_sprites(st->draw, FALSE); -#endif -} - - -void st_destroy_draw( struct st_context *st ) -{ -#if FEATURE_feedback || FEATURE_rastpos - draw_destroy(st->draw); -#endif -} - - +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/* + * This file implements the st_draw_vbo() function which is called from + * Mesa's VBO module. All point/line/triangle rendering is done through + * this function whether the user called glBegin/End, glDrawArrays, + * glDrawElements, glEvalMesh, or glCalList, etc. + * + * We basically convert the VBO's vertex attribute/array information into + * Gallium vertex state, bind the vertex buffer objects and call + * pipe->draw_elements(), pipe->draw_range_elements() or pipe->draw_arrays(). 
+ * + * Authors: + * Keith Whitwell + */ + + +#include "main/imports.h" +#include "main/image.h" +#include "main/macros.h" +#include "main/mfeatures.h" +#include "program/prog_uniform.h" + +#include "vbo/vbo.h" + +#include "st_context.h" +#include "st_atom.h" +#include "st_cb_bufferobjects.h" +#include "st_draw.h" +#include "st_program.h" + +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "util/u_inlines.h" +#include "util/u_format.h" +#include "util/u_prim.h" +#include "util/u_draw_quad.h" +#include "draw/draw_context.h" +#include "cso_cache/cso_context.h" + + +static GLuint double_types[4] = { + PIPE_FORMAT_R64_FLOAT, + PIPE_FORMAT_R64G64_FLOAT, + PIPE_FORMAT_R64G64B64_FLOAT, + PIPE_FORMAT_R64G64B64A64_FLOAT +}; + +static GLuint float_types[4] = { + PIPE_FORMAT_R32_FLOAT, + PIPE_FORMAT_R32G32_FLOAT, + PIPE_FORMAT_R32G32B32_FLOAT, + PIPE_FORMAT_R32G32B32A32_FLOAT +}; + +static GLuint half_float_types[4] = { + PIPE_FORMAT_R16_FLOAT, + PIPE_FORMAT_R16G16_FLOAT, + PIPE_FORMAT_R16G16B16_FLOAT, + PIPE_FORMAT_R16G16B16A16_FLOAT +}; + +static GLuint uint_types_norm[4] = { + PIPE_FORMAT_R32_UNORM, + PIPE_FORMAT_R32G32_UNORM, + PIPE_FORMAT_R32G32B32_UNORM, + PIPE_FORMAT_R32G32B32A32_UNORM +}; + +static GLuint uint_types_scale[4] = { + PIPE_FORMAT_R32_USCALED, + PIPE_FORMAT_R32G32_USCALED, + PIPE_FORMAT_R32G32B32_USCALED, + PIPE_FORMAT_R32G32B32A32_USCALED +}; + +static GLuint int_types_norm[4] = { + PIPE_FORMAT_R32_SNORM, + PIPE_FORMAT_R32G32_SNORM, + PIPE_FORMAT_R32G32B32_SNORM, + PIPE_FORMAT_R32G32B32A32_SNORM +}; + +static GLuint int_types_scale[4] = { + PIPE_FORMAT_R32_SSCALED, + PIPE_FORMAT_R32G32_SSCALED, + PIPE_FORMAT_R32G32B32_SSCALED, + PIPE_FORMAT_R32G32B32A32_SSCALED +}; + +static GLuint ushort_types_norm[4] = { + PIPE_FORMAT_R16_UNORM, + PIPE_FORMAT_R16G16_UNORM, + PIPE_FORMAT_R16G16B16_UNORM, + PIPE_FORMAT_R16G16B16A16_UNORM +}; + +static GLuint ushort_types_scale[4] = { + PIPE_FORMAT_R16_USCALED, + PIPE_FORMAT_R16G16_USCALED, + PIPE_FORMAT_R16G16B16_USCALED, + PIPE_FORMAT_R16G16B16A16_USCALED +}; + +static GLuint short_types_norm[4] = { + PIPE_FORMAT_R16_SNORM, + PIPE_FORMAT_R16G16_SNORM, + PIPE_FORMAT_R16G16B16_SNORM, + PIPE_FORMAT_R16G16B16A16_SNORM +}; + +static GLuint short_types_scale[4] = { + PIPE_FORMAT_R16_SSCALED, + PIPE_FORMAT_R16G16_SSCALED, + PIPE_FORMAT_R16G16B16_SSCALED, + PIPE_FORMAT_R16G16B16A16_SSCALED +}; + +static GLuint ubyte_types_norm[4] = { + PIPE_FORMAT_R8_UNORM, + PIPE_FORMAT_R8G8_UNORM, + PIPE_FORMAT_R8G8B8_UNORM, + PIPE_FORMAT_R8G8B8A8_UNORM +}; + +static GLuint ubyte_types_scale[4] = { + PIPE_FORMAT_R8_USCALED, + PIPE_FORMAT_R8G8_USCALED, + PIPE_FORMAT_R8G8B8_USCALED, + PIPE_FORMAT_R8G8B8A8_USCALED +}; + +static GLuint byte_types_norm[4] = { + PIPE_FORMAT_R8_SNORM, + PIPE_FORMAT_R8G8_SNORM, + PIPE_FORMAT_R8G8B8_SNORM, + PIPE_FORMAT_R8G8B8A8_SNORM +}; + +static GLuint byte_types_scale[4] = { + PIPE_FORMAT_R8_SSCALED, + PIPE_FORMAT_R8G8_SSCALED, + PIPE_FORMAT_R8G8B8_SSCALED, + PIPE_FORMAT_R8G8B8A8_SSCALED +}; + +static GLuint fixed_types[4] = { + PIPE_FORMAT_R32_FIXED, + PIPE_FORMAT_R32G32_FIXED, + PIPE_FORMAT_R32G32B32_FIXED, + PIPE_FORMAT_R32G32B32A32_FIXED +}; + + + +/** + * Return a PIPE_FORMAT_x for the given GL datatype and size. 
+ */ +GLuint +st_pipe_vertex_format(GLenum type, GLuint size, GLenum format, + GLboolean normalized) +{ + assert((type >= GL_BYTE && type <= GL_DOUBLE) || + type == GL_FIXED || type == GL_HALF_FLOAT); + assert(size >= 1); + assert(size <= 4); + assert(format == GL_RGBA || format == GL_BGRA); + + if (format == GL_BGRA) { + /* this is an odd-ball case */ + assert(type == GL_UNSIGNED_BYTE); + assert(normalized); + return PIPE_FORMAT_B8G8R8A8_UNORM; + } + + if (normalized) { + switch (type) { + case GL_DOUBLE: return double_types[size-1]; + case GL_FLOAT: return float_types[size-1]; + case GL_HALF_FLOAT: return half_float_types[size-1]; + case GL_INT: return int_types_norm[size-1]; + case GL_SHORT: return short_types_norm[size-1]; + case GL_BYTE: return byte_types_norm[size-1]; + case GL_UNSIGNED_INT: return uint_types_norm[size-1]; + case GL_UNSIGNED_SHORT: return ushort_types_norm[size-1]; + case GL_UNSIGNED_BYTE: return ubyte_types_norm[size-1]; + case GL_FIXED: return fixed_types[size-1]; + default: assert(0); return 0; + } + } + else { + switch (type) { + case GL_DOUBLE: return double_types[size-1]; + case GL_FLOAT: return float_types[size-1]; + case GL_HALF_FLOAT: return half_float_types[size-1]; + case GL_INT: return int_types_scale[size-1]; + case GL_SHORT: return short_types_scale[size-1]; + case GL_BYTE: return byte_types_scale[size-1]; + case GL_UNSIGNED_INT: return uint_types_scale[size-1]; + case GL_UNSIGNED_SHORT: return ushort_types_scale[size-1]; + case GL_UNSIGNED_BYTE: return ubyte_types_scale[size-1]; + case GL_FIXED: return fixed_types[size-1]; + default: assert(0); return 0; + } + } + return 0; /* silence compiler warning */ +} + + + + + +/** + * Examine the active arrays to determine if we have interleaved + * vertex arrays all living in one VBO, or all living in user space. + * \param userSpace returns whether the arrays are in user space. + */ +static GLboolean +is_interleaved_arrays(const struct st_vertex_program *vp, + const struct st_vp_variant *vpv, + const struct gl_client_array **arrays) +{ + GLuint attr; + const struct gl_buffer_object *firstBufObj = NULL; + GLint firstStride = -1; + const GLubyte *client_addr = NULL; + + for (attr = 0; attr < vpv->num_inputs; attr++) { + const GLuint mesaAttr = vp->index_to_input[attr]; + const struct gl_buffer_object *bufObj = arrays[mesaAttr]->BufferObj; + const GLsizei stride = arrays[mesaAttr]->StrideB; /* in bytes */ + + if (firstStride < 0) { + firstStride = stride; + } + else if (firstStride != stride) { + return GL_FALSE; + } + + if (!bufObj || !bufObj->Name) { + /* Try to detect if the client-space arrays are + * "close" to each other. + */ + if (!client_addr) { + client_addr = arrays[mesaAttr]->Ptr; + } + else if (abs(arrays[mesaAttr]->Ptr - client_addr) > firstStride) { + /* arrays start too far apart */ + return GL_FALSE; + } + } + else if (!firstBufObj) { + firstBufObj = bufObj; + } + else if (bufObj != firstBufObj) { + return GL_FALSE; + } + } + + return GL_TRUE; +} + + +/** + * Set up for drawing interleaved arrays that all live in one VBO + * or all live in user space. 
+ * \param vbuffer returns vertex buffer info + * \param velements returns vertex element info + */ +static void +setup_interleaved_attribs(struct gl_context *ctx, + const struct st_vertex_program *vp, + const struct st_vp_variant *vpv, + const struct gl_client_array **arrays, + struct pipe_vertex_buffer *vbuffer, + struct pipe_vertex_element velements[], + unsigned max_index) +{ + struct st_context *st = st_context(ctx); + struct pipe_context *pipe = st->pipe; + GLuint attr; + const GLubyte *low_addr = NULL; + + /* Find the lowest address. */ + for (attr = 0; attr < vpv->num_inputs; attr++) { + const GLubyte *start = arrays[vp->index_to_input[attr]]->Ptr; + + low_addr = !low_addr ? start : MIN2(low_addr, start); + } + + for (attr = 0; attr < vpv->num_inputs; attr++) { + const GLuint mesaAttr = vp->index_to_input[attr]; + struct gl_buffer_object *bufobj = arrays[mesaAttr]->BufferObj; + struct st_buffer_object *stobj = st_buffer_object(bufobj); + GLsizei stride = arrays[mesaAttr]->StrideB; + + if (attr == 0) { + if (bufobj && bufobj->Name) { + vbuffer->buffer = NULL; + pipe_resource_reference(&vbuffer->buffer, stobj->buffer); + vbuffer->buffer_offset = pointer_to_offset(low_addr); + } else { + vbuffer->buffer = + pipe_user_buffer_create(pipe->screen, (void*)low_addr, + stride * (max_index + 1), + PIPE_BIND_VERTEX_BUFFER); + vbuffer->buffer_offset = 0; + + /* Track user vertex buffers. */ + pipe_resource_reference(&st->user_vb[0], vbuffer->buffer); + st->user_vb_stride[0] = stride; + st->num_user_vbs = 1; + } + vbuffer->stride = stride; /* in bytes */ + } + + velements[attr].src_offset = + (unsigned) (arrays[mesaAttr]->Ptr - low_addr); + velements[attr].instance_divisor = arrays[mesaAttr]->InstanceDivisor; + velements[attr].vertex_buffer_index = 0; + velements[attr].src_format = + st_pipe_vertex_format(arrays[mesaAttr]->Type, + arrays[mesaAttr]->Size, + arrays[mesaAttr]->Format, + arrays[mesaAttr]->Normalized); + assert(velements[attr].src_format); + } +} + + +/** + * Set up a separate pipe_vertex_buffer and pipe_vertex_element for each + * vertex attribute. + * \param vbuffer returns vertex buffer info + * \param velements returns vertex element info + */ +static void +setup_non_interleaved_attribs(struct gl_context *ctx, + const struct st_vertex_program *vp, + const struct st_vp_variant *vpv, + const struct gl_client_array **arrays, + struct pipe_vertex_buffer vbuffer[], + struct pipe_vertex_element velements[], + unsigned max_index) +{ + struct st_context *st = st_context(ctx); + struct pipe_context *pipe = st->pipe; + GLuint attr; + + for (attr = 0; attr < vpv->num_inputs; attr++) { + const GLuint mesaAttr = vp->index_to_input[attr]; + struct gl_buffer_object *bufobj = arrays[mesaAttr]->BufferObj; + GLsizei stride = arrays[mesaAttr]->StrideB; + + if (bufobj && bufobj->Name) { + /* Attribute data is in a VBO. + * Recall that for VBOs, the gl_client_array->Ptr field is + * really an offset from the start of the VBO, not a pointer. 
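
As the comment above notes, when attribute data lives in a VBO the gl_client_array::Ptr field carries a byte offset stored in pointer form; only for client-side arrays is it a real address. A minimal illustration of that convention, assuming the names used in this patch (pointer_to_offset() is the existing Mesa helper; client_array_offset() is a made-up name for the sketch):

static unsigned
client_array_offset(const struct gl_client_array *array)
{
   if (array->BufferObj && array->BufferObj->Name) {
      /* Bound VBO: Ptr encodes an offset from the start of the buffer. */
      return pointer_to_offset(array->Ptr);
   }
   /* Client array: Ptr is an ordinary pointer into application memory;
    * the data is wrapped zero-copy by pipe_user_buffer_create() and the
    * offset within that wrapper is 0. */
   return 0;
}

setup_interleaved_attribs() and setup_non_interleaved_attribs() apply exactly this split when filling in the buffer_offset fields.
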
+ */ + struct st_buffer_object *stobj = st_buffer_object(bufobj); + assert(stobj->buffer); + + vbuffer[attr].buffer = NULL; + pipe_resource_reference(&vbuffer[attr].buffer, stobj->buffer); + vbuffer[attr].buffer_offset = pointer_to_offset(arrays[mesaAttr]->Ptr); + } + else { + /* wrap user data */ + if (arrays[mesaAttr]->Ptr) { + vbuffer[attr].buffer = + pipe_user_buffer_create(pipe->screen, + (void *) arrays[mesaAttr]->Ptr, + stride * (max_index + 1), + PIPE_BIND_VERTEX_BUFFER); + } + else { + /* no array, use ctx->Current.Attrib[] value */ + uint bytes = sizeof(ctx->Current.Attrib[0]); + vbuffer[attr].buffer = + pipe_user_buffer_create(pipe->screen, + (void *) ctx->Current.Attrib[mesaAttr], + bytes, + PIPE_BIND_VERTEX_BUFFER); + stride = 0; + } + + vbuffer[attr].buffer_offset = 0; + + /* Track user vertex buffers. */ + pipe_resource_reference(&st->user_vb[attr], vbuffer->buffer); + st->user_vb_stride[attr] = stride; + st->num_user_vbs = MAX2(st->num_user_vbs, attr+1); + } + + /* common-case setup */ + vbuffer[attr].stride = stride; /* in bytes */ + + velements[attr].src_offset = 0; + velements[attr].instance_divisor = arrays[mesaAttr]->InstanceDivisor; + velements[attr].vertex_buffer_index = attr; + velements[attr].src_format + = st_pipe_vertex_format(arrays[mesaAttr]->Type, + arrays[mesaAttr]->Size, + arrays[mesaAttr]->Format, + arrays[mesaAttr]->Normalized); + assert(velements[attr].src_format); + } +} + + +static void +setup_index_buffer(struct gl_context *ctx, + const struct _mesa_index_buffer *ib, + struct pipe_index_buffer *ibuffer) +{ + struct st_context *st = st_context(ctx); + struct pipe_context *pipe = st->pipe; + + memset(ibuffer, 0, sizeof(*ibuffer)); + if (ib) { + struct gl_buffer_object *bufobj = ib->obj; + + switch (ib->type) { + case GL_UNSIGNED_INT: + ibuffer->index_size = 4; + break; + case GL_UNSIGNED_SHORT: + ibuffer->index_size = 2; + break; + case GL_UNSIGNED_BYTE: + ibuffer->index_size = 1; + break; + default: + assert(0); + return; + } + + /* get/create the index buffer object */ + if (bufobj && bufobj->Name) { + /* elements/indexes are in a real VBO */ + struct st_buffer_object *stobj = st_buffer_object(bufobj); + pipe_resource_reference(&ibuffer->buffer, stobj->buffer); + ibuffer->offset = pointer_to_offset(ib->ptr); + } + else { + /* element/indicies are in user space memory */ + ibuffer->buffer = + pipe_user_buffer_create(pipe->screen, (void *) ib->ptr, + ib->count * ibuffer->index_size, + PIPE_BIND_INDEX_BUFFER); + } + } +} + +/** + * Prior to drawing, check that any uniforms referenced by the + * current shader have been set. If a uniform has not been set, + * issue a warning. + */ +static void +check_uniforms(struct gl_context *ctx) +{ + struct gl_shader_program *shProg[3] = { + ctx->Shader.CurrentVertexProgram, + ctx->Shader.CurrentGeometryProgram, + ctx->Shader.CurrentFragmentProgram, + }; + unsigned j; + + for (j = 0; j < 3; j++) { + unsigned i; + + if (shProg[j] == NULL || !shProg[j]->LinkStatus) + continue; + + for (i = 0; i < shProg[j]->Uniforms->NumUniforms; i++) { + const struct gl_uniform *u = &shProg[j]->Uniforms->Uniforms[i]; + if (!u->Initialized) { + _mesa_warning(ctx, + "Using shader with uninitialized uniform: %s", + u->Name); + } + } + } +} + + +/** + * Translate OpenGL primtive type (GL_POINTS, GL_TRIANGLE_STRIP, etc) to + * the corresponding Gallium type. 
+ */ +static unsigned +translate_prim(const struct gl_context *ctx, unsigned prim) +{ + /* GL prims should match Gallium prims, spot-check a few */ + assert(GL_POINTS == PIPE_PRIM_POINTS); + assert(GL_QUADS == PIPE_PRIM_QUADS); + assert(GL_TRIANGLE_STRIP_ADJACENCY == PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY); + + /* Avoid quadstrips if it's easy to do so: + * Note: it's imporant to do the correct trimming if we change the prim type! + * We do that wherever this function is called. + */ + if (prim == GL_QUAD_STRIP && + ctx->Light.ShadeModel != GL_FLAT && + ctx->Polygon.FrontMode == GL_FILL && + ctx->Polygon.BackMode == GL_FILL) + prim = GL_TRIANGLE_STRIP; + + return prim; +} + + +static void +st_validate_varrays(struct gl_context *ctx, + const struct gl_client_array **arrays, + unsigned max_index) +{ + struct st_context *st = st_context(ctx); + const struct st_vertex_program *vp; + const struct st_vp_variant *vpv; + struct pipe_vertex_buffer vbuffer[PIPE_MAX_SHADER_INPUTS]; + struct pipe_vertex_element velements[PIPE_MAX_ATTRIBS]; + unsigned num_vbuffers, num_velements; + GLuint attr; + unsigned i; + + /* must get these after state validation! */ + vp = st->vp; + vpv = st->vp_variant; + + memset(velements, 0, sizeof(struct pipe_vertex_element) * vpv->num_inputs); + + /* Unreference any user vertex buffers. */ + for (i = 0; i < st->num_user_vbs; i++) { + pipe_resource_reference(&st->user_vb[i], NULL); + } + st->num_user_vbs = 0; + + /* + * Setup the vbuffer[] and velements[] arrays. + */ + if (is_interleaved_arrays(vp, vpv, arrays)) { + setup_interleaved_attribs(ctx, vp, vpv, arrays, vbuffer, velements, + max_index); + num_vbuffers = 1; + num_velements = vpv->num_inputs; + if (num_velements == 0) + num_vbuffers = 0; + } + else { + setup_non_interleaved_attribs(ctx, vp, vpv, arrays, + vbuffer, velements, max_index); + num_vbuffers = vpv->num_inputs; + num_velements = vpv->num_inputs; + } + + cso_set_vertex_buffers(st->cso_context, num_vbuffers, vbuffer); + cso_set_vertex_elements(st->cso_context, num_velements, velements); + + /* unreference buffers (frees wrapped user-space buffer objects) + * This is OK, because the pipe driver should reference buffers by itself + * in set_vertex_buffers. */ + for (attr = 0; attr < num_vbuffers; attr++) { + pipe_resource_reference(&vbuffer[attr].buffer, NULL); + assert(!vbuffer[attr].buffer); + } +} + + +/** + * This function gets plugged into the VBO module and is called when + * we have something to render. + * Basically, translate the information into the format expected by gallium. + */ +void +st_draw_vbo(struct gl_context *ctx, + const struct gl_client_array **arrays, + const struct _mesa_prim *prims, + GLuint nr_prims, + const struct _mesa_index_buffer *ib, + GLboolean index_bounds_valid, + GLuint min_index, + GLuint max_index) +{ + struct st_context *st = st_context(ctx); + struct pipe_context *pipe = st->pipe; + struct pipe_index_buffer ibuffer; + struct pipe_draw_info info; + unsigned i; + GLboolean new_array = + st->dirty.st && (st->dirty.mesa & (_NEW_ARRAY | _NEW_PROGRAM)) != 0; + + /* Mesa core state should have been validated already */ + assert(ctx->NewState == 0x0); + + if (ib) { + /* Gallium probably doesn't want this in some cases. */ + if (!index_bounds_valid) + if (!vbo_all_varyings_in_vbos(arrays)) + vbo_get_minmax_index(ctx, prims, ib, &min_index, &max_index); + } else { + /* Get min/max index for non-indexed drawing. 
*/ + min_index = ~0; + max_index = 0; + + for (i = 0; i < nr_prims; i++) { + min_index = MIN2(min_index, prims[i].start); + max_index = MAX2(max_index, prims[i].start + prims[i].count - 1); + } + } + + /* Validate state. */ + if (st->dirty.st) { + GLboolean vertDataEdgeFlags; + + /* sanity check for pointer arithmetic below */ + assert(sizeof(arrays[0]->Ptr[0]) == 1); + + vertDataEdgeFlags = arrays[VERT_ATTRIB_EDGEFLAG]->BufferObj && + arrays[VERT_ATTRIB_EDGEFLAG]->BufferObj->Name; + if (vertDataEdgeFlags != st->vertdata_edgeflags) { + st->vertdata_edgeflags = vertDataEdgeFlags; + st->dirty.st |= ST_NEW_EDGEFLAGS_DATA; + } + + st_validate_state(st); + + if (new_array) { + st_validate_varrays(ctx, arrays, max_index); + } + +#if 0 + if (MESA_VERBOSE & VERBOSE_GLSL) { + check_uniforms(ctx); + } +#else + (void) check_uniforms; +#endif + } + + /* Notify the driver that the content of user buffers may have been + * changed. */ + if (!new_array && st->num_user_vbs) { + for (i = 0; i < st->num_user_vbs; i++) { + if (st->user_vb[i]) { + unsigned stride = st->user_vb_stride[i]; + + if (stride) { + pipe->redefine_user_buffer(pipe, st->user_vb[i], + min_index * stride, + (max_index + 1 - min_index) * stride); + } else { + /* stride == 0 */ + pipe->redefine_user_buffer(pipe, st->user_vb[i], + 0, st->user_vb[i]->width0); + } + } + } + } + + setup_index_buffer(ctx, ib, &ibuffer); + pipe->set_index_buffer(pipe, &ibuffer); + + util_draw_init_info(&info); + if (ib) { + info.indexed = TRUE; + if (min_index != ~0 && max_index != ~0) { + info.min_index = min_index; + info.max_index = max_index; + } + } + + info.primitive_restart = st->ctx->Array.PrimitiveRestart; + info.restart_index = st->ctx->Array.RestartIndex; + + /* do actual drawing */ + for (i = 0; i < nr_prims; i++) { + info.mode = translate_prim( ctx, prims[i].mode ); + info.start = prims[i].start; + info.count = prims[i].count; + info.instance_count = prims[i].num_instances; + info.index_bias = prims[i].basevertex; + if (!ib) { + info.min_index = info.start; + info.max_index = info.start + info.count - 1; + } + + if (u_trim_pipe_prim(info.mode, &info.count)) + pipe->draw_vbo(pipe, &info); + } + + pipe_resource_reference(&ibuffer.buffer, NULL); +} + + +void st_init_draw( struct st_context *st ) +{ + struct gl_context *ctx = st->ctx; + + vbo_set_draw_func(ctx, st_draw_vbo); + +#if FEATURE_feedback || FEATURE_rastpos + st->draw = draw_create(st->pipe); /* for selection/feedback */ + + /* Disable draw options that might convert points/lines to tris, etc. + * as that would foul-up feedback/selection mode. 
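+ * (Editor's note) The 1000.0-pixel thresholds below effectively disable + * wide line/point expansion, and line stipple / point sprites are turned + * off explicitly.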
+ */ + draw_wide_line_threshold(st->draw, 1000.0f); + draw_wide_point_threshold(st->draw, 1000.0f); + draw_enable_line_stipple(st->draw, FALSE); + draw_enable_point_sprites(st->draw, FALSE); +#endif +} + + +void st_destroy_draw( struct st_context *st ) +{ +#if FEATURE_feedback || FEATURE_rastpos + draw_destroy(st->draw); +#endif +} + + diff --git a/mesalib/src/mesa/state_tracker/st_draw_feedback.c b/mesalib/src/mesa/state_tracker/st_draw_feedback.c index 545b32d75..1e1220bfe 100644 --- a/mesalib/src/mesa/state_tracker/st_draw_feedback.c +++ b/mesalib/src/mesa/state_tracker/st_draw_feedback.c @@ -179,7 +179,6 @@ st_feedback_draw_vbo(struct gl_context *ctx, /* common-case setup */ vbuffers[attr].stride = arrays[mesaAttr]->StrideB; /* in bytes */ - vbuffers[attr].max_index = max_index; velements[attr].instance_divisor = 0; velements[attr].vertex_buffer_index = attr; velements[attr].src_format = diff --git a/mesalib/src/mesa/state_tracker/st_gen_mipmap.c b/mesalib/src/mesa/state_tracker/st_gen_mipmap.c index 0be66a2c2..18eb3be68 100644 --- a/mesalib/src/mesa/state_tracker/st_gen_mipmap.c +++ b/mesalib/src/mesa/state_tracker/st_gen_mipmap.c @@ -1,424 +1,422 @@ -/************************************************************************** - * - * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. - * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR - * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - **************************************************************************/ - - -#include "main/imports.h" -#include "main/mipmap.h" -#include "main/teximage.h" - -#include "pipe/p_context.h" -#include "pipe/p_defines.h" -#include "util/u_inlines.h" -#include "util/u_format.h" -#include "util/u_gen_mipmap.h" - -#include "st_debug.h" -#include "st_context.h" -#include "st_texture.h" -#include "st_gen_mipmap.h" -#include "st_cb_texture.h" - - -/** - * one-time init for generate mipmap - * XXX Note: there may be other times we need no-op/simple state like this. - * In that case, some code refactoring would be good. - */ -void -st_init_generate_mipmap(struct st_context *st) -{ - st->gen_mipmap = util_create_gen_mipmap(st->pipe, st->cso_context); -} - - -void -st_destroy_generate_mipmap(struct st_context *st) -{ - util_destroy_gen_mipmap(st->gen_mipmap); - st->gen_mipmap = NULL; -} - - -/** - * Generate mipmap levels using hardware rendering. 
- * \return TRUE if successful, FALSE if not possible - */ -static boolean -st_render_mipmap(struct st_context *st, - GLenum target, - struct st_texture_object *stObj, - uint baseLevel, uint lastLevel) -{ - struct pipe_context *pipe = st->pipe; - struct pipe_screen *screen = pipe->screen; - struct pipe_sampler_view *psv = st_get_texture_sampler_view(stObj, pipe); - const uint face = _mesa_tex_target_to_face(target); - - assert(psv->texture == stObj->pt); -#if 0 - assert(target != GL_TEXTURE_3D); /* implemented but untested */ -#endif - - /* check if we can render in the texture's format */ - /* XXX should probably kill this and always use util_gen_mipmap - since this implements a sw fallback as well */ - if (!screen->is_format_supported(screen, psv->format, psv->texture->target, - 0, PIPE_BIND_RENDER_TARGET, 0)) { - return FALSE; - } - - util_gen_mipmap(st->gen_mipmap, psv, face, baseLevel, lastLevel, - PIPE_TEX_FILTER_LINEAR); - - return TRUE; -} - - -/** - * Helper function to decompress an image. The result is a 32-bpp RGBA - * image with stride==width. - */ -static void -decompress_image(enum pipe_format format, - const uint8_t *src, uint8_t *dst, - unsigned width, unsigned height) -{ - const struct util_format_description *desc = util_format_description(format); - const uint bw = util_format_get_blockwidth(format); - const uint bh = util_format_get_blockheight(format); - const uint dst_stride = 4 * MAX2(width, bw); - const uint src_stride = util_format_get_stride(format, width); - - desc->unpack_rgba_8unorm(dst, dst_stride, src, src_stride, width, height); - - if (width < bw || height < bh) { - /* We're decompressing an image smaller than the compression - * block size. We don't want garbage pixel values in the region - * outside (width x height) so replicate pixels from the (width - * x height) region to fill out the (bw x bh) block size. - */ - uint x, y; - for (y = 0; y < bh; y++) { - for (x = 0; x < bw; x++) { - if (x >= width || y >= height) { - uint p = (y * bw + x) * 4; - dst[p + 0] = dst[0]; - dst[p + 1] = dst[1]; - dst[p + 2] = dst[2]; - dst[p + 3] = dst[3]; - } - } - } - } -} - - -/** - * Helper function to compress an image. The source is a 32-bpp RGBA image - * with stride==width. - */ -static void -compress_image(enum pipe_format format, - const uint8_t *src, uint8_t *dst, - unsigned width, unsigned height) -{ - const struct util_format_description *desc = util_format_description(format); - const uint dst_stride = util_format_get_stride(format, width); - const uint src_stride = 4 * width; - - desc->pack_rgba_8unorm(dst, dst_stride, src, src_stride, width, height); -} - - -/** - * Software fallback for generate mipmap levels. 
- */ -static void -fallback_generate_mipmap(struct gl_context *ctx, GLenum target, - struct gl_texture_object *texObj) -{ - struct pipe_context *pipe = st_context(ctx)->pipe; - struct pipe_resource *pt = st_get_texobj_resource(texObj); - const uint baseLevel = texObj->BaseLevel; - const uint lastLevel = pt->last_level; - const uint face = _mesa_tex_target_to_face(target); - uint dstLevel; - GLenum datatype; - GLuint comps; - GLboolean compressed; - - if (ST_DEBUG & DEBUG_FALLBACK) - debug_printf("%s: fallback processing\n", __FUNCTION__); - - assert(target != GL_TEXTURE_3D); /* not done yet */ - - compressed = - _mesa_is_format_compressed(texObj->Image[face][baseLevel]->TexFormat); - - if (compressed) { - datatype = GL_UNSIGNED_BYTE; - comps = 4; - } - else { - _mesa_format_to_type_and_comps(texObj->Image[face][baseLevel]->TexFormat, - &datatype, &comps); - assert(comps > 0 && "bad texture format in fallback_generate_mipmap()"); - } - - for (dstLevel = baseLevel + 1; dstLevel <= lastLevel; dstLevel++) { - const uint srcLevel = dstLevel - 1; - const uint srcWidth = u_minify(pt->width0, srcLevel); - const uint srcHeight = u_minify(pt->height0, srcLevel); - const uint srcDepth = u_minify(pt->depth0, srcLevel); - const uint dstWidth = u_minify(pt->width0, dstLevel); - const uint dstHeight = u_minify(pt->height0, dstLevel); - const uint dstDepth = u_minify(pt->depth0, dstLevel); - struct pipe_transfer *srcTrans, *dstTrans; - const ubyte *srcData; - ubyte *dstData; - int srcStride, dstStride; - - srcTrans = pipe_get_transfer(st_context(ctx)->pipe, pt, srcLevel, - face, - PIPE_TRANSFER_READ, 0, 0, - srcWidth, srcHeight); - - dstTrans = pipe_get_transfer(st_context(ctx)->pipe, pt, dstLevel, - face, - PIPE_TRANSFER_WRITE, 0, 0, - dstWidth, dstHeight); - - srcData = (ubyte *) pipe_transfer_map(pipe, srcTrans); - dstData = (ubyte *) pipe_transfer_map(pipe, dstTrans); - - srcStride = srcTrans->stride / util_format_get_blocksize(srcTrans->resource->format); - dstStride = dstTrans->stride / util_format_get_blocksize(dstTrans->resource->format); - - /* this cannot work correctly for 3d since it does - not respect layerStride. 
*/ - if (compressed) { - const enum pipe_format format = pt->format; - const uint bw = util_format_get_blockwidth(format); - const uint bh = util_format_get_blockheight(format); - const uint srcWidth2 = align(srcWidth, bw); - const uint srcHeight2 = align(srcHeight, bh); - const uint dstWidth2 = align(dstWidth, bw); - const uint dstHeight2 = align(dstHeight, bh); - uint8_t *srcTemp, *dstTemp; - - assert(comps == 4); - - srcTemp = malloc(srcWidth2 * srcHeight2 * comps + 000); - dstTemp = malloc(dstWidth2 * dstHeight2 * comps + 000); - - /* decompress the src image: srcData -> srcTemp */ - decompress_image(format, srcData, srcTemp, srcWidth, srcHeight); - - _mesa_generate_mipmap_level(target, datatype, comps, - 0 /*border*/, - srcWidth2, srcHeight2, srcDepth, - srcTemp, - srcWidth2, /* stride in texels */ - dstWidth2, dstHeight2, dstDepth, - dstTemp, - dstWidth2); /* stride in texels */ - - /* compress the new image: dstTemp -> dstData */ - compress_image(format, dstTemp, dstData, dstWidth, dstHeight); - - free(srcTemp); - free(dstTemp); - } - else { - _mesa_generate_mipmap_level(target, datatype, comps, - 0 /*border*/, - srcWidth, srcHeight, srcDepth, - srcData, - srcStride, /* stride in texels */ - dstWidth, dstHeight, dstDepth, - dstData, - dstStride); /* stride in texels */ - } - - pipe_transfer_unmap(pipe, srcTrans); - pipe_transfer_unmap(pipe, dstTrans); - - pipe->transfer_destroy(pipe, srcTrans); - pipe->transfer_destroy(pipe, dstTrans); - } -} - - -/** - * Compute the expected number of mipmap levels in the texture given - * the width/height/depth of the base image and the GL_TEXTURE_BASE_LEVEL/ - * GL_TEXTURE_MAX_LEVEL settings. This will tell us how many mipmap - * levels should be generated. - */ -static GLuint -compute_num_levels(struct gl_context *ctx, - struct gl_texture_object *texObj, - GLenum target) -{ - if (target == GL_TEXTURE_RECTANGLE_ARB) { - return 1; - } - else { - const struct gl_texture_image *baseImage = - _mesa_get_tex_image(ctx, texObj, target, texObj->BaseLevel); - GLuint size, numLevels; - - size = MAX2(baseImage->Width2, baseImage->Height2); - size = MAX2(size, baseImage->Depth2); - - numLevels = texObj->BaseLevel; - - while (size > 0) { - numLevels++; - size >>= 1; - } - - numLevels = MIN2(numLevels, texObj->MaxLevel + 1); - - assert(numLevels >= 1); - - return numLevels; - } -} - - -/** - * Called via ctx->Driver.GenerateMipmap(). - */ -void -st_generate_mipmap(struct gl_context *ctx, GLenum target, - struct gl_texture_object *texObj) -{ - struct st_context *st = st_context(ctx); - struct st_texture_object *stObj = st_texture_object(texObj); - struct pipe_resource *pt = st_get_texobj_resource(texObj); - const uint baseLevel = texObj->BaseLevel; - uint lastLevel; - uint dstLevel; - - if (!pt) - return; - - /* not sure if this ultimately actually should work, - but we're not supporting multisampled textures yet. */ - assert(pt->nr_samples < 2); - - /* find expected last mipmap level to generate*/ - lastLevel = compute_num_levels(ctx, texObj, target) - 1; - - if (lastLevel == 0) - return; - - if (pt->last_level < lastLevel) { - /* The current gallium texture doesn't have space for all the - * mipmap levels we need to generate. So allocate a new texture. 
- */ - struct pipe_resource *oldTex = stObj->pt; - - /* create new texture with space for more levels */ - stObj->pt = st_texture_create(st, - oldTex->target, - oldTex->format, - lastLevel, - oldTex->width0, - oldTex->height0, - oldTex->depth0, - oldTex->array_size, - oldTex->bind); - - /* The texture isn't in a "complete" state yet so set the expected - * lastLevel here, since it won't get done in st_finalize_texture(). - */ - stObj->lastLevel = lastLevel; - - /* This will copy the old texture's base image into the new texture - * which we just allocated. - */ - st_finalize_texture(ctx, st->pipe, texObj); - - /* release the old tex (will likely be freed too) */ - pipe_resource_reference(&oldTex, NULL); - pipe_sampler_view_reference(&stObj->sampler_view, NULL); - - pt = stObj->pt; - } - else { - /* Make sure that the base texture image data is present in the - * texture buffer. - */ - st_finalize_texture(ctx, st->pipe, texObj); - } - - assert(pt->last_level >= lastLevel); - - /* Try to generate the mipmap by rendering/texturing. If that fails, - * use the software fallback. - */ - if (!st_render_mipmap(st, target, stObj, baseLevel, lastLevel)) { - /* since the util code actually also has a fallback, should - probably make it never fail and kill this */ - fallback_generate_mipmap(ctx, target, texObj); - } - - /* Fill in the Mesa gl_texture_image fields */ - for (dstLevel = baseLevel + 1; dstLevel <= lastLevel; dstLevel++) { - const uint srcLevel = dstLevel - 1; - const struct gl_texture_image *srcImage - = _mesa_get_tex_image(ctx, texObj, target, srcLevel); - struct gl_texture_image *dstImage; - struct st_texture_image *stImage; - uint dstWidth = u_minify(pt->width0, dstLevel); - uint dstHeight = u_minify(pt->height0, dstLevel); - uint dstDepth = u_minify(pt->depth0, dstLevel); - uint border = srcImage->Border; - - dstImage = _mesa_get_tex_image(ctx, texObj, target, dstLevel); - if (!dstImage) { - _mesa_error(ctx, GL_OUT_OF_MEMORY, "generating mipmaps"); - return; - } - - /* Free old image data */ - if (dstImage->Data) - ctx->Driver.FreeTexImageData(ctx, dstImage); - - /* initialize new image */ - _mesa_init_teximage_fields(ctx, target, dstImage, dstWidth, dstHeight, - dstDepth, border, srcImage->InternalFormat, - srcImage->TexFormat); - - stImage = st_texture_image(dstImage); - stImage->level = dstLevel; - - pipe_resource_reference(&stImage->pt, pt); - } -} +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#include "main/imports.h" +#include "main/mipmap.h" +#include "main/teximage.h" + +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "util/u_inlines.h" +#include "util/u_format.h" +#include "util/u_gen_mipmap.h" + +#include "st_debug.h" +#include "st_context.h" +#include "st_texture.h" +#include "st_gen_mipmap.h" +#include "st_cb_texture.h" + + +/** + * one-time init for generate mipmap + * XXX Note: there may be other times we need no-op/simple state like this. + * In that case, some code refactoring would be good. + */ +void +st_init_generate_mipmap(struct st_context *st) +{ + st->gen_mipmap = util_create_gen_mipmap(st->pipe, st->cso_context); +} + + +void +st_destroy_generate_mipmap(struct st_context *st) +{ + util_destroy_gen_mipmap(st->gen_mipmap); + st->gen_mipmap = NULL; +} + + +/** + * Generate mipmap levels using hardware rendering. + * \return TRUE if successful, FALSE if not possible + */ +static boolean +st_render_mipmap(struct st_context *st, + GLenum target, + struct st_texture_object *stObj, + uint baseLevel, uint lastLevel) +{ + struct pipe_context *pipe = st->pipe; + struct pipe_screen *screen = pipe->screen; + struct pipe_sampler_view *psv = st_get_texture_sampler_view(stObj, pipe); + const uint face = _mesa_tex_target_to_face(target); + + assert(psv->texture == stObj->pt); +#if 0 + assert(target != GL_TEXTURE_3D); /* implemented but untested */ +#endif + + /* check if we can render in the texture's format */ + /* XXX should probably kill this and always use util_gen_mipmap + since this implements a sw fallback as well */ + if (!screen->is_format_supported(screen, psv->format, psv->texture->target, + 0, PIPE_BIND_RENDER_TARGET, 0)) { + return FALSE; + } + + util_gen_mipmap(st->gen_mipmap, psv, face, baseLevel, lastLevel, + PIPE_TEX_FILTER_LINEAR); + + return TRUE; +} + + +/** + * Helper function to decompress an image. The result is a 32-bpp RGBA + * image with stride==width. + */ +static void +decompress_image(enum pipe_format format, + const uint8_t *src, uint8_t *dst, + unsigned width, unsigned height, unsigned src_stride) +{ + const struct util_format_description *desc = util_format_description(format); + const uint bw = util_format_get_blockwidth(format); + const uint bh = util_format_get_blockheight(format); + const uint dst_stride = 4 * MAX2(width, bw); + + desc->unpack_rgba_8unorm(dst, dst_stride, src, src_stride, width, height); + + if (width < bw || height < bh) { + /* We're decompressing an image smaller than the compression + * block size. We don't want garbage pixel values in the region + * outside (width x height) so replicate pixels from the (width + * x height) region to fill out the (bw x bh) block size. + */ + uint x, y; + for (y = 0; y < bh; y++) { + for (x = 0; x < bw; x++) { + if (x >= width || y >= height) { + uint p = (y * bw + x) * 4; + dst[p + 0] = dst[0]; + dst[p + 1] = dst[1]; + dst[p + 2] = dst[2]; + dst[p + 3] = dst[3]; + } + } + } + } +} + + +/** + * Helper function to compress an image. The source is a 32-bpp RGBA image + * with stride==width. 
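+ * The destination row stride (in bytes) is supplied by the caller, since + * the transfer's stride may be larger than the minimal packed stride.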
+ */ +static void +compress_image(enum pipe_format format, + const uint8_t *src, uint8_t *dst, + unsigned width, unsigned height, unsigned dst_stride) +{ + const struct util_format_description *desc = util_format_description(format); + const uint src_stride = 4 * width; + + desc->pack_rgba_8unorm(dst, dst_stride, src, src_stride, width, height); +} + + +/** + * Software fallback for generate mipmap levels. + */ +static void +fallback_generate_mipmap(struct gl_context *ctx, GLenum target, + struct gl_texture_object *texObj) +{ + struct pipe_context *pipe = st_context(ctx)->pipe; + struct pipe_resource *pt = st_get_texobj_resource(texObj); + const uint baseLevel = texObj->BaseLevel; + const uint lastLevel = pt->last_level; + const uint face = _mesa_tex_target_to_face(target); + uint dstLevel; + GLenum datatype; + GLuint comps; + GLboolean compressed; + + if (ST_DEBUG & DEBUG_FALLBACK) + debug_printf("%s: fallback processing\n", __FUNCTION__); + + assert(target != GL_TEXTURE_3D); /* not done yet */ + + compressed = + _mesa_is_format_compressed(texObj->Image[face][baseLevel]->TexFormat); + + if (compressed) { + datatype = GL_UNSIGNED_BYTE; + comps = 4; + } + else { + _mesa_format_to_type_and_comps(texObj->Image[face][baseLevel]->TexFormat, + &datatype, &comps); + assert(comps > 0 && "bad texture format in fallback_generate_mipmap()"); + } + + for (dstLevel = baseLevel + 1; dstLevel <= lastLevel; dstLevel++) { + const uint srcLevel = dstLevel - 1; + const uint srcWidth = u_minify(pt->width0, srcLevel); + const uint srcHeight = u_minify(pt->height0, srcLevel); + const uint srcDepth = u_minify(pt->depth0, srcLevel); + const uint dstWidth = u_minify(pt->width0, dstLevel); + const uint dstHeight = u_minify(pt->height0, dstLevel); + const uint dstDepth = u_minify(pt->depth0, dstLevel); + struct pipe_transfer *srcTrans, *dstTrans; + const ubyte *srcData; + ubyte *dstData; + int srcStride, dstStride; + + srcTrans = pipe_get_transfer(st_context(ctx)->pipe, pt, srcLevel, + face, + PIPE_TRANSFER_READ, 0, 0, + srcWidth, srcHeight); + + dstTrans = pipe_get_transfer(st_context(ctx)->pipe, pt, dstLevel, + face, + PIPE_TRANSFER_WRITE, 0, 0, + dstWidth, dstHeight); + + srcData = (ubyte *) pipe_transfer_map(pipe, srcTrans); + dstData = (ubyte *) pipe_transfer_map(pipe, dstTrans); + + srcStride = srcTrans->stride / util_format_get_blocksize(srcTrans->resource->format); + dstStride = dstTrans->stride / util_format_get_blocksize(dstTrans->resource->format); + + /* this cannot work correctly for 3d since it does + not respect layerStride. 
*/ + if (compressed) { + const enum pipe_format format = pt->format; + const uint bw = util_format_get_blockwidth(format); + const uint bh = util_format_get_blockheight(format); + const uint srcWidth2 = align(srcWidth, bw); + const uint srcHeight2 = align(srcHeight, bh); + const uint dstWidth2 = align(dstWidth, bw); + const uint dstHeight2 = align(dstHeight, bh); + uint8_t *srcTemp, *dstTemp; + + assert(comps == 4); + + srcTemp = malloc(srcWidth2 * srcHeight2 * comps + 000); + dstTemp = malloc(dstWidth2 * dstHeight2 * comps + 000); + + /* decompress the src image: srcData -> srcTemp */ + decompress_image(format, srcData, srcTemp, srcWidth, srcHeight, srcTrans->stride); + + _mesa_generate_mipmap_level(target, datatype, comps, + 0 /*border*/, + srcWidth2, srcHeight2, srcDepth, + srcTemp, + srcWidth2, /* stride in texels */ + dstWidth2, dstHeight2, dstDepth, + dstTemp, + dstWidth2); /* stride in texels */ + + /* compress the new image: dstTemp -> dstData */ + compress_image(format, dstTemp, dstData, dstWidth, dstHeight, dstTrans->stride); + + free(srcTemp); + free(dstTemp); + } + else { + _mesa_generate_mipmap_level(target, datatype, comps, + 0 /*border*/, + srcWidth, srcHeight, srcDepth, + srcData, + srcStride, /* stride in texels */ + dstWidth, dstHeight, dstDepth, + dstData, + dstStride); /* stride in texels */ + } + + pipe_transfer_unmap(pipe, srcTrans); + pipe_transfer_unmap(pipe, dstTrans); + + pipe->transfer_destroy(pipe, srcTrans); + pipe->transfer_destroy(pipe, dstTrans); + } +} + + +/** + * Compute the expected number of mipmap levels in the texture given + * the width/height/depth of the base image and the GL_TEXTURE_BASE_LEVEL/ + * GL_TEXTURE_MAX_LEVEL settings. This will tell us how many mipmap + * levels should be generated. + */ +static GLuint +compute_num_levels(struct gl_context *ctx, + struct gl_texture_object *texObj, + GLenum target) +{ + if (target == GL_TEXTURE_RECTANGLE_ARB) { + return 1; + } + else { + const struct gl_texture_image *baseImage = + _mesa_get_tex_image(ctx, texObj, target, texObj->BaseLevel); + GLuint size, numLevels; + + size = MAX2(baseImage->Width2, baseImage->Height2); + size = MAX2(size, baseImage->Depth2); + + numLevels = texObj->BaseLevel; + + while (size > 0) { + numLevels++; + size >>= 1; + } + + numLevels = MIN2(numLevels, texObj->MaxLevel + 1); + + assert(numLevels >= 1); + + return numLevels; + } +} + + +/** + * Called via ctx->Driver.GenerateMipmap(). + */ +void +st_generate_mipmap(struct gl_context *ctx, GLenum target, + struct gl_texture_object *texObj) +{ + struct st_context *st = st_context(ctx); + struct st_texture_object *stObj = st_texture_object(texObj); + struct pipe_resource *pt = st_get_texobj_resource(texObj); + const uint baseLevel = texObj->BaseLevel; + uint lastLevel; + uint dstLevel; + + if (!pt) + return; + + /* not sure if this ultimately actually should work, + but we're not supporting multisampled textures yet. */ + assert(pt->nr_samples < 2); + + /* find expected last mipmap level to generate*/ + lastLevel = compute_num_levels(ctx, texObj, target) - 1; + + if (lastLevel == 0) + return; + + if (pt->last_level < lastLevel) { + /* The current gallium texture doesn't have space for all the + * mipmap levels we need to generate. So allocate a new texture. 
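+ * For example, a 16x8 base image needs levels 0..4 (16x8 down to 1x1); + * if the existing resource only has last_level == 0, a larger one with + * room for all five levels is created here.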
+ */ + struct pipe_resource *oldTex = stObj->pt; + + /* create new texture with space for more levels */ + stObj->pt = st_texture_create(st, + oldTex->target, + oldTex->format, + lastLevel, + oldTex->width0, + oldTex->height0, + oldTex->depth0, + oldTex->array_size, + oldTex->bind); + + /* The texture isn't in a "complete" state yet so set the expected + * lastLevel here, since it won't get done in st_finalize_texture(). + */ + stObj->lastLevel = lastLevel; + + /* This will copy the old texture's base image into the new texture + * which we just allocated. + */ + st_finalize_texture(ctx, st->pipe, texObj); + + /* release the old tex (will likely be freed too) */ + pipe_resource_reference(&oldTex, NULL); + pipe_sampler_view_reference(&stObj->sampler_view, NULL); + + pt = stObj->pt; + } + else { + /* Make sure that the base texture image data is present in the + * texture buffer. + */ + st_finalize_texture(ctx, st->pipe, texObj); + } + + assert(pt->last_level >= lastLevel); + + /* Try to generate the mipmap by rendering/texturing. If that fails, + * use the software fallback. + */ + if (!st_render_mipmap(st, target, stObj, baseLevel, lastLevel)) { + /* since the util code actually also has a fallback, should + probably make it never fail and kill this */ + fallback_generate_mipmap(ctx, target, texObj); + } + + /* Fill in the Mesa gl_texture_image fields */ + for (dstLevel = baseLevel + 1; dstLevel <= lastLevel; dstLevel++) { + const uint srcLevel = dstLevel - 1; + const struct gl_texture_image *srcImage + = _mesa_get_tex_image(ctx, texObj, target, srcLevel); + struct gl_texture_image *dstImage; + struct st_texture_image *stImage; + uint dstWidth = u_minify(pt->width0, dstLevel); + uint dstHeight = u_minify(pt->height0, dstLevel); + uint dstDepth = u_minify(pt->depth0, dstLevel); + uint border = srcImage->Border; + + dstImage = _mesa_get_tex_image(ctx, texObj, target, dstLevel); + if (!dstImage) { + _mesa_error(ctx, GL_OUT_OF_MEMORY, "generating mipmaps"); + return; + } + + /* Free old image data */ + if (dstImage->Data) + ctx->Driver.FreeTexImageData(ctx, dstImage); + + /* initialize new image */ + _mesa_init_teximage_fields(ctx, target, dstImage, dstWidth, dstHeight, + dstDepth, border, srcImage->InternalFormat, + srcImage->TexFormat); + + stImage = st_texture_image(dstImage); + stImage->level = dstLevel; + + pipe_resource_reference(&stImage->pt, pt); + } +} diff --git a/mesalib/src/mesa/tnl/t_draw.c b/mesalib/src/mesa/tnl/t_draw.c index 741f0ed3f..b1967e654 100644 --- a/mesalib/src/mesa/tnl/t_draw.c +++ b/mesalib/src/mesa/tnl/t_draw.c @@ -1,494 +1,534 @@ -/* - * Mesa 3-D graphics library - * Version: 7.1 - * - * Copyright (C) 1999-2007 Brian Paul All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL - * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN - * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * Authors: - * Keith Whitwell - */ - -#include "main/glheader.h" -#include "main/condrender.h" -#include "main/context.h" -#include "main/imports.h" -#include "main/mtypes.h" -#include "main/macros.h" -#include "main/enums.h" - -#include "t_context.h" -#include "tnl.h" - - - -static GLubyte *get_space(struct gl_context *ctx, GLuint bytes) -{ - TNLcontext *tnl = TNL_CONTEXT(ctx); - GLubyte *space = malloc(bytes); - - tnl->block[tnl->nr_blocks++] = space; - return space; -} - - -static void free_space(struct gl_context *ctx) -{ - TNLcontext *tnl = TNL_CONTEXT(ctx); - GLuint i; - for (i = 0; i < tnl->nr_blocks; i++) - free(tnl->block[i]); - tnl->nr_blocks = 0; -} - - -/* Convert the incoming array to GLfloats. Understands the - * array->Normalized flag and selects the correct conversion method. - */ -#define CONVERT( TYPE, MACRO ) do { \ - GLuint i, j; \ - if (input->Normalized) { \ - for (i = 0; i < count; i++) { \ - const TYPE *in = (TYPE *)ptr; \ - for (j = 0; j < sz; j++) { \ - *fptr++ = MACRO(*in); \ - in++; \ - } \ - ptr += input->StrideB; \ - } \ - } else { \ - for (i = 0; i < count; i++) { \ - const TYPE *in = (TYPE *)ptr; \ - for (j = 0; j < sz; j++) { \ - *fptr++ = (GLfloat)(*in); \ - in++; \ - } \ - ptr += input->StrideB; \ - } \ - } \ -} while (0) - - -/** - * Convert array of BGRA/GLubyte[4] values to RGBA/float[4] - * \param ptr input/ubyte array - * \param fptr output/float array - */ -static void -convert_bgra_to_float(const struct gl_client_array *input, - const GLubyte *ptr, GLfloat *fptr, - GLuint count ) -{ - GLuint i; - assert(input->Normalized); - assert(input->Size == 4); - for (i = 0; i < count; i++) { - const GLubyte *in = (GLubyte *) ptr; /* in is in BGRA order */ - *fptr++ = UBYTE_TO_FLOAT(in[2]); /* red */ - *fptr++ = UBYTE_TO_FLOAT(in[1]); /* green */ - *fptr++ = UBYTE_TO_FLOAT(in[0]); /* blue */ - *fptr++ = UBYTE_TO_FLOAT(in[3]); /* alpha */ - ptr += input->StrideB; - } -} - -static void -convert_half_to_float(const struct gl_client_array *input, - const GLubyte *ptr, GLfloat *fptr, - GLuint count, GLuint sz) -{ - GLuint i, j; - - for (i = 0; i < count; i++) { - GLhalfARB *in = (GLhalfARB *)ptr; - - for (j = 0; j < sz; j++) { - *fptr++ = _mesa_half_to_float(in[j]); - } - ptr += input->StrideB; - } -} - -/* Adjust pointer to point at first requested element, convert to - * floating point, populate VB->AttribPtr[]. 
- */ -static void _tnl_import_array( struct gl_context *ctx, - GLuint attrib, - GLuint count, - const struct gl_client_array *input, - const GLubyte *ptr ) -{ - TNLcontext *tnl = TNL_CONTEXT(ctx); - struct vertex_buffer *VB = &tnl->vb; - GLuint stride = input->StrideB; - - if (input->Type != GL_FLOAT) { - const GLuint sz = input->Size; - GLubyte *buf = get_space(ctx, count * sz * sizeof(GLfloat)); - GLfloat *fptr = (GLfloat *)buf; - - switch (input->Type) { - case GL_BYTE: - CONVERT(GLbyte, BYTE_TO_FLOAT); - break; - case GL_UNSIGNED_BYTE: - if (input->Format == GL_BGRA) { - /* See GL_EXT_vertex_array_bgra */ - convert_bgra_to_float(input, ptr, fptr, count); - } - else { - CONVERT(GLubyte, UBYTE_TO_FLOAT); - } - break; - case GL_SHORT: - CONVERT(GLshort, SHORT_TO_FLOAT); - break; - case GL_UNSIGNED_SHORT: - CONVERT(GLushort, USHORT_TO_FLOAT); - break; - case GL_INT: - CONVERT(GLint, INT_TO_FLOAT); - break; - case GL_UNSIGNED_INT: - CONVERT(GLuint, UINT_TO_FLOAT); - break; - case GL_DOUBLE: - CONVERT(GLdouble, (GLfloat)); - break; - case GL_HALF_FLOAT: - convert_half_to_float(input, ptr, fptr, count, sz); - break; - default: - assert(0); - break; - } - - ptr = buf; - stride = sz * sizeof(GLfloat); - } - - VB->AttribPtr[attrib] = &tnl->tmp_inputs[attrib]; - VB->AttribPtr[attrib]->data = (GLfloat (*)[4])ptr; - VB->AttribPtr[attrib]->start = (GLfloat *)ptr; - VB->AttribPtr[attrib]->count = count; - VB->AttribPtr[attrib]->stride = stride; - VB->AttribPtr[attrib]->size = input->Size; - - /* This should die, but so should the whole GLvector4f concept: - */ - VB->AttribPtr[attrib]->flags = (((1<Size)-1) | - VEC_NOT_WRITEABLE | - (stride == 4*sizeof(GLfloat) ? 0 : VEC_BAD_STRIDE)); - - VB->AttribPtr[attrib]->storage = NULL; -} - -#define CLIPVERTS ((6 + MAX_CLIP_PLANES) * 2) - - -static GLboolean *_tnl_import_edgeflag( struct gl_context *ctx, - const GLvector4f *input, - GLuint count) -{ - const GLubyte *ptr = (const GLubyte *)input->data; - const GLuint stride = input->stride; - GLboolean *space = (GLboolean *)get_space(ctx, count + CLIPVERTS); - GLboolean *bptr = space; - GLuint i; - - for (i = 0; i < count; i++) { - *bptr++ = ((GLfloat *)ptr)[0] == 1.0; - ptr += stride; - } - - return space; -} - - -static void bind_inputs( struct gl_context *ctx, - const struct gl_client_array *inputs[], - GLint count, - struct gl_buffer_object **bo, - GLuint *nr_bo ) -{ - TNLcontext *tnl = TNL_CONTEXT(ctx); - struct vertex_buffer *VB = &tnl->vb; - GLuint i; - - /* Map all the VBOs - */ - for (i = 0; i < VERT_ATTRIB_MAX; i++) { - const void *ptr; - - if (inputs[i]->BufferObj->Name) { - if (!inputs[i]->BufferObj->Pointer) { - bo[*nr_bo] = inputs[i]->BufferObj; - (*nr_bo)++; - ctx->Driver.MapBuffer(ctx, - GL_ARRAY_BUFFER, - GL_READ_ONLY_ARB, - inputs[i]->BufferObj); - - assert(inputs[i]->BufferObj->Pointer); - } - - ptr = ADD_POINTERS(inputs[i]->BufferObj->Pointer, - inputs[i]->Ptr); - } - else - ptr = inputs[i]->Ptr; - - /* Just make sure the array is floating point, otherwise convert to - * temporary storage. - * - * XXX: remove the GLvector4f type at some stage and just use - * client arrays. - */ - _tnl_import_array(ctx, i, count, inputs[i], ptr); - } - - /* We process only the vertices between min & max index: - */ - VB->Count = count; - - /* These should perhaps be part of _TNL_ATTRIB_* */ - VB->BackfaceColorPtr = NULL; - VB->BackfaceIndexPtr = NULL; - VB->BackfaceSecondaryColorPtr = NULL; - - /* Clipping and drawing code still requires this to be a packed - * array of ubytes which can be written into. 
TODO: Fix and - * remove. - */ - if (ctx->Polygon.FrontMode != GL_FILL || - ctx->Polygon.BackMode != GL_FILL) - { - VB->EdgeFlag = _tnl_import_edgeflag( ctx, - VB->AttribPtr[_TNL_ATTRIB_EDGEFLAG], - VB->Count ); - } - else { - /* the data previously pointed to by EdgeFlag may have been freed */ - VB->EdgeFlag = NULL; - } -} - - -/* Translate indices to GLuints and store in VB->Elts. - */ -static void bind_indices( struct gl_context *ctx, - const struct _mesa_index_buffer *ib, - struct gl_buffer_object **bo, - GLuint *nr_bo) -{ - TNLcontext *tnl = TNL_CONTEXT(ctx); - struct vertex_buffer *VB = &tnl->vb; - GLuint i; - void *ptr; - - if (!ib) { - VB->Elts = NULL; - return; - } - - if (ib->obj->Name && !ib->obj->Pointer) { - bo[*nr_bo] = ib->obj; - (*nr_bo)++; - ctx->Driver.MapBuffer(ctx, - GL_ELEMENT_ARRAY_BUFFER, - GL_READ_ONLY_ARB, - ib->obj); - - assert(ib->obj->Pointer); - } - - ptr = ADD_POINTERS(ib->obj->Pointer, ib->ptr); - - if (ib->type == GL_UNSIGNED_INT && VB->Primitive[0].basevertex == 0) { - VB->Elts = (GLuint *) ptr; - } - else { - GLuint *elts = (GLuint *)get_space(ctx, ib->count * sizeof(GLuint)); - VB->Elts = elts; - - if (ib->type == GL_UNSIGNED_INT) { - const GLuint *in = (GLuint *)ptr; - for (i = 0; i < ib->count; i++) - *elts++ = (GLuint)(*in++) + VB->Primitive[0].basevertex; - } - else if (ib->type == GL_UNSIGNED_SHORT) { - const GLushort *in = (GLushort *)ptr; - for (i = 0; i < ib->count; i++) - *elts++ = (GLuint)(*in++) + VB->Primitive[0].basevertex; - } - else { - const GLubyte *in = (GLubyte *)ptr; - for (i = 0; i < ib->count; i++) - *elts++ = (GLuint)(*in++) + VB->Primitive[0].basevertex; - } - } -} - -static void bind_prims( struct gl_context *ctx, - const struct _mesa_prim *prim, - GLuint nr_prims ) -{ - TNLcontext *tnl = TNL_CONTEXT(ctx); - struct vertex_buffer *VB = &tnl->vb; - - VB->Primitive = prim; - VB->PrimitiveCount = nr_prims; -} - -static void unmap_vbos( struct gl_context *ctx, - struct gl_buffer_object **bo, - GLuint nr_bo ) -{ - GLuint i; - for (i = 0; i < nr_bo; i++) { - ctx->Driver.UnmapBuffer(ctx, - 0, /* target -- I don't see why this would be needed */ - bo[i]); - } -} - - -void _tnl_vbo_draw_prims(struct gl_context *ctx, - const struct gl_client_array *arrays[], - const struct _mesa_prim *prim, - GLuint nr_prims, - const struct _mesa_index_buffer *ib, - GLboolean index_bounds_valid, - GLuint min_index, - GLuint max_index) -{ - if (!index_bounds_valid) - vbo_get_minmax_index(ctx, prim, ib, &min_index, &max_index); - - _tnl_draw_prims(ctx, arrays, prim, nr_prims, ib, min_index, max_index); -} - -/* This is the main entrypoint into the slimmed-down software tnl - * module. In a regular swtnl driver, this can be plugged straight - * into the vbo->Driver.DrawPrims() callback. - */ -void _tnl_draw_prims( struct gl_context *ctx, - const struct gl_client_array *arrays[], - const struct _mesa_prim *prim, - GLuint nr_prims, - const struct _mesa_index_buffer *ib, - GLuint min_index, - GLuint max_index) -{ - TNLcontext *tnl = TNL_CONTEXT(ctx); - const GLuint TEST_SPLIT = 0; - const GLint max = TEST_SPLIT ? 
8 : tnl->vb.Size - MAX_CLIPPED_VERTICES; - GLint max_basevertex = prim->basevertex; - GLuint i; - - /* Mesa core state should have been validated already */ - assert(ctx->NewState == 0x0); - - if (!_mesa_check_conditional_render(ctx)) - return; /* don't draw */ - - for (i = 1; i < nr_prims; i++) - max_basevertex = MAX2(max_basevertex, prim[i].basevertex); - - if (0) - { - printf("%s %d..%d\n", __FUNCTION__, min_index, max_index); - for (i = 0; i < nr_prims; i++) - printf("prim %d: %s start %d count %d\n", i, - _mesa_lookup_enum_by_nr(prim[i].mode), - prim[i].start, - prim[i].count); - } - - if (min_index) { - /* We always translate away calls with min_index != 0. - */ - vbo_rebase_prims( ctx, arrays, prim, nr_prims, ib, - min_index, max_index, - _tnl_vbo_draw_prims ); - return; - } - else if ((GLint)max_index + max_basevertex > max) { - /* The software TNL pipeline has a fixed amount of storage for - * vertices and it is necessary to split incoming drawing commands - * if they exceed that limit. - */ - struct split_limits limits; - limits.max_verts = max; - limits.max_vb_size = ~0; - limits.max_indices = ~0; - - /* This will split the buffers one way or another and - * recursively call back into this function. - */ - vbo_split_prims( ctx, arrays, prim, nr_prims, ib, - 0, max_index + prim->basevertex, - _tnl_vbo_draw_prims, - &limits ); - } - else { - /* May need to map a vertex buffer object for every attribute plus - * one for the index buffer. - */ - struct gl_buffer_object *bo[VERT_ATTRIB_MAX + 1]; - GLuint nr_bo = 0; - GLuint inst; - - for (i = 0; i < nr_prims;) { - GLuint this_nr_prims; - - /* Our SW TNL pipeline doesn't handle basevertex yet, so bind_indices - * will rebase the elements to the basevertex, and we'll only - * emit strings of prims with the same basevertex in one draw call. - */ - for (this_nr_prims = 1; i + this_nr_prims < nr_prims; - this_nr_prims++) { - if (prim[i].basevertex != prim[i + this_nr_prims].basevertex) - break; - } - - assert(prim[i].num_instances > 0); - - /* Binding inputs may imply mapping some vertex buffer objects. - * They will need to be unmapped below. - */ - for (inst = 0; inst < prim[i].num_instances; inst++) { - - bind_prims(ctx, &prim[i], this_nr_prims); - bind_inputs(ctx, arrays, max_index + prim[i].basevertex + 1, - bo, &nr_bo); - bind_indices(ctx, ib, bo, &nr_bo); - - tnl->CurInstance = inst; - TNL_CONTEXT(ctx)->Driver.RunPipeline(ctx); - - unmap_vbos(ctx, bo, nr_bo); - free_space(ctx); - } - - i += this_nr_prims; - } - } -} - +/* + * Mesa 3-D graphics library + * Version: 7.1 + * + * Copyright (C) 1999-2007 Brian Paul All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN + * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * Authors: + * Keith Whitwell + */ + +#include "main/glheader.h" +#include "main/condrender.h" +#include "main/context.h" +#include "main/imports.h" +#include "main/mtypes.h" +#include "main/macros.h" +#include "main/enums.h" + +#include "t_context.h" +#include "tnl.h" + + + +static GLubyte *get_space(struct gl_context *ctx, GLuint bytes) +{ + TNLcontext *tnl = TNL_CONTEXT(ctx); + GLubyte *space = malloc(bytes); + + tnl->block[tnl->nr_blocks++] = space; + return space; +} + + +static void free_space(struct gl_context *ctx) +{ + TNLcontext *tnl = TNL_CONTEXT(ctx); + GLuint i; + for (i = 0; i < tnl->nr_blocks; i++) + free(tnl->block[i]); + tnl->nr_blocks = 0; +} + + +/* Convert the incoming array to GLfloats. Understands the + * array->Normalized flag and selects the correct conversion method. + */ +#define CONVERT( TYPE, MACRO ) do { \ + GLuint i, j; \ + if (input->Normalized) { \ + for (i = 0; i < count; i++) { \ + const TYPE *in = (TYPE *)ptr; \ + for (j = 0; j < sz; j++) { \ + *fptr++ = MACRO(*in); \ + in++; \ + } \ + ptr += input->StrideB; \ + } \ + } else { \ + for (i = 0; i < count; i++) { \ + const TYPE *in = (TYPE *)ptr; \ + for (j = 0; j < sz; j++) { \ + *fptr++ = (GLfloat)(*in); \ + in++; \ + } \ + ptr += input->StrideB; \ + } \ + } \ +} while (0) + + +/** + * Convert array of BGRA/GLubyte[4] values to RGBA/float[4] + * \param ptr input/ubyte array + * \param fptr output/float array + */ +static void +convert_bgra_to_float(const struct gl_client_array *input, + const GLubyte *ptr, GLfloat *fptr, + GLuint count ) +{ + GLuint i; + assert(input->Normalized); + assert(input->Size == 4); + for (i = 0; i < count; i++) { + const GLubyte *in = (GLubyte *) ptr; /* in is in BGRA order */ + *fptr++ = UBYTE_TO_FLOAT(in[2]); /* red */ + *fptr++ = UBYTE_TO_FLOAT(in[1]); /* green */ + *fptr++ = UBYTE_TO_FLOAT(in[0]); /* blue */ + *fptr++ = UBYTE_TO_FLOAT(in[3]); /* alpha */ + ptr += input->StrideB; + } +} + +static void +convert_half_to_float(const struct gl_client_array *input, + const GLubyte *ptr, GLfloat *fptr, + GLuint count, GLuint sz) +{ + GLuint i, j; + + for (i = 0; i < count; i++) { + GLhalfARB *in = (GLhalfARB *)ptr; + + for (j = 0; j < sz; j++) { + *fptr++ = _mesa_half_to_float(in[j]); + } + ptr += input->StrideB; + } +} + +/** + * \brief Convert fixed-point to floating-point. + * + * In OpenGL, a fixed-point number is a "signed 2's complement 16.16 scaled + * integer" (Table 2.2 of the OpenGL ES 2.0 spec). + * + * If the buffer has the \c normalized flag set, the formula + * \code normalize(x) := (2*x + 1) / (2^16 - 1) \endcode + * is used to map the fixed-point numbers into the range [-1, 1]. 
+ */ +static void +convert_fixed_to_float(const struct gl_client_array *input, + const GLubyte *ptr, GLfloat *fptr, + GLuint count) +{ + GLuint i, j; + const GLint size = input->Size; + + if (input->Normalized) { + for (i = 0; i < count; ++i) { + const GLfixed *in = (GLfixed *) ptr; + for (j = 0; j < size; ++j) { + *fptr++ = (GLfloat) (2 * in[j] + 1) / (GLfloat) ((1 << 16) - 1); + } + ptr += input->StrideB; + } + } else { + for (i = 0; i < count; ++i) { + const GLfixed *in = (GLfixed *) ptr; + for (j = 0; j < size; ++j) { + *fptr++ = in[j] / (GLfloat) (1 << 16); + } + ptr += input->StrideB; + } + } +} + +/* Adjust pointer to point at first requested element, convert to + * floating point, populate VB->AttribPtr[]. + */ +static void _tnl_import_array( struct gl_context *ctx, + GLuint attrib, + GLuint count, + const struct gl_client_array *input, + const GLubyte *ptr ) +{ + TNLcontext *tnl = TNL_CONTEXT(ctx); + struct vertex_buffer *VB = &tnl->vb; + GLuint stride = input->StrideB; + + if (input->Type != GL_FLOAT) { + const GLuint sz = input->Size; + GLubyte *buf = get_space(ctx, count * sz * sizeof(GLfloat)); + GLfloat *fptr = (GLfloat *)buf; + + switch (input->Type) { + case GL_BYTE: + CONVERT(GLbyte, BYTE_TO_FLOAT); + break; + case GL_UNSIGNED_BYTE: + if (input->Format == GL_BGRA) { + /* See GL_EXT_vertex_array_bgra */ + convert_bgra_to_float(input, ptr, fptr, count); + } + else { + CONVERT(GLubyte, UBYTE_TO_FLOAT); + } + break; + case GL_SHORT: + CONVERT(GLshort, SHORT_TO_FLOAT); + break; + case GL_UNSIGNED_SHORT: + CONVERT(GLushort, USHORT_TO_FLOAT); + break; + case GL_INT: + CONVERT(GLint, INT_TO_FLOAT); + break; + case GL_UNSIGNED_INT: + CONVERT(GLuint, UINT_TO_FLOAT); + break; + case GL_DOUBLE: + CONVERT(GLdouble, (GLfloat)); + break; + case GL_HALF_FLOAT: + convert_half_to_float(input, ptr, fptr, count, sz); + break; + case GL_FIXED: + convert_fixed_to_float(input, ptr, fptr, count); + break; + default: + assert(0); + break; + } + + ptr = buf; + stride = sz * sizeof(GLfloat); + } + + VB->AttribPtr[attrib] = &tnl->tmp_inputs[attrib]; + VB->AttribPtr[attrib]->data = (GLfloat (*)[4])ptr; + VB->AttribPtr[attrib]->start = (GLfloat *)ptr; + VB->AttribPtr[attrib]->count = count; + VB->AttribPtr[attrib]->stride = stride; + VB->AttribPtr[attrib]->size = input->Size; + + /* This should die, but so should the whole GLvector4f concept: + */ + VB->AttribPtr[attrib]->flags = (((1<Size)-1) | + VEC_NOT_WRITEABLE | + (stride == 4*sizeof(GLfloat) ? 
0 : VEC_BAD_STRIDE)); + + VB->AttribPtr[attrib]->storage = NULL; +} + +#define CLIPVERTS ((6 + MAX_CLIP_PLANES) * 2) + + +static GLboolean *_tnl_import_edgeflag( struct gl_context *ctx, + const GLvector4f *input, + GLuint count) +{ + const GLubyte *ptr = (const GLubyte *)input->data; + const GLuint stride = input->stride; + GLboolean *space = (GLboolean *)get_space(ctx, count + CLIPVERTS); + GLboolean *bptr = space; + GLuint i; + + for (i = 0; i < count; i++) { + *bptr++ = ((GLfloat *)ptr)[0] == 1.0; + ptr += stride; + } + + return space; +} + + +static void bind_inputs( struct gl_context *ctx, + const struct gl_client_array *inputs[], + GLint count, + struct gl_buffer_object **bo, + GLuint *nr_bo ) +{ + TNLcontext *tnl = TNL_CONTEXT(ctx); + struct vertex_buffer *VB = &tnl->vb; + GLuint i; + + /* Map all the VBOs + */ + for (i = 0; i < VERT_ATTRIB_MAX; i++) { + const void *ptr; + + if (inputs[i]->BufferObj->Name) { + if (!inputs[i]->BufferObj->Pointer) { + bo[*nr_bo] = inputs[i]->BufferObj; + (*nr_bo)++; + ctx->Driver.MapBuffer(ctx, + GL_ARRAY_BUFFER, + GL_READ_ONLY_ARB, + inputs[i]->BufferObj); + + assert(inputs[i]->BufferObj->Pointer); + } + + ptr = ADD_POINTERS(inputs[i]->BufferObj->Pointer, + inputs[i]->Ptr); + } + else + ptr = inputs[i]->Ptr; + + /* Just make sure the array is floating point, otherwise convert to + * temporary storage. + * + * XXX: remove the GLvector4f type at some stage and just use + * client arrays. + */ + _tnl_import_array(ctx, i, count, inputs[i], ptr); + } + + /* We process only the vertices between min & max index: + */ + VB->Count = count; + + /* These should perhaps be part of _TNL_ATTRIB_* */ + VB->BackfaceColorPtr = NULL; + VB->BackfaceIndexPtr = NULL; + VB->BackfaceSecondaryColorPtr = NULL; + + /* Clipping and drawing code still requires this to be a packed + * array of ubytes which can be written into. TODO: Fix and + * remove. + */ + if (ctx->Polygon.FrontMode != GL_FILL || + ctx->Polygon.BackMode != GL_FILL) + { + VB->EdgeFlag = _tnl_import_edgeflag( ctx, + VB->AttribPtr[_TNL_ATTRIB_EDGEFLAG], + VB->Count ); + } + else { + /* the data previously pointed to by EdgeFlag may have been freed */ + VB->EdgeFlag = NULL; + } +} + + +/* Translate indices to GLuints and store in VB->Elts. 
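+ * Any non-zero basevertex is folded in here; e.g. GL_UNSIGNED_SHORT indices + * {2, 5, 7} with basevertex 10 are stored as {12, 15, 17}.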
+ */ +static void bind_indices( struct gl_context *ctx, + const struct _mesa_index_buffer *ib, + struct gl_buffer_object **bo, + GLuint *nr_bo) +{ + TNLcontext *tnl = TNL_CONTEXT(ctx); + struct vertex_buffer *VB = &tnl->vb; + GLuint i; + void *ptr; + + if (!ib) { + VB->Elts = NULL; + return; + } + + if (ib->obj->Name && !ib->obj->Pointer) { + bo[*nr_bo] = ib->obj; + (*nr_bo)++; + ctx->Driver.MapBuffer(ctx, + GL_ELEMENT_ARRAY_BUFFER, + GL_READ_ONLY_ARB, + ib->obj); + + assert(ib->obj->Pointer); + } + + ptr = ADD_POINTERS(ib->obj->Pointer, ib->ptr); + + if (ib->type == GL_UNSIGNED_INT && VB->Primitive[0].basevertex == 0) { + VB->Elts = (GLuint *) ptr; + } + else { + GLuint *elts = (GLuint *)get_space(ctx, ib->count * sizeof(GLuint)); + VB->Elts = elts; + + if (ib->type == GL_UNSIGNED_INT) { + const GLuint *in = (GLuint *)ptr; + for (i = 0; i < ib->count; i++) + *elts++ = (GLuint)(*in++) + VB->Primitive[0].basevertex; + } + else if (ib->type == GL_UNSIGNED_SHORT) { + const GLushort *in = (GLushort *)ptr; + for (i = 0; i < ib->count; i++) + *elts++ = (GLuint)(*in++) + VB->Primitive[0].basevertex; + } + else { + const GLubyte *in = (GLubyte *)ptr; + for (i = 0; i < ib->count; i++) + *elts++ = (GLuint)(*in++) + VB->Primitive[0].basevertex; + } + } +} + +static void bind_prims( struct gl_context *ctx, + const struct _mesa_prim *prim, + GLuint nr_prims ) +{ + TNLcontext *tnl = TNL_CONTEXT(ctx); + struct vertex_buffer *VB = &tnl->vb; + + VB->Primitive = prim; + VB->PrimitiveCount = nr_prims; +} + +static void unmap_vbos( struct gl_context *ctx, + struct gl_buffer_object **bo, + GLuint nr_bo ) +{ + GLuint i; + for (i = 0; i < nr_bo; i++) { + ctx->Driver.UnmapBuffer(ctx, + 0, /* target -- I don't see why this would be needed */ + bo[i]); + } +} + + +void _tnl_vbo_draw_prims(struct gl_context *ctx, + const struct gl_client_array *arrays[], + const struct _mesa_prim *prim, + GLuint nr_prims, + const struct _mesa_index_buffer *ib, + GLboolean index_bounds_valid, + GLuint min_index, + GLuint max_index) +{ + if (!index_bounds_valid) + vbo_get_minmax_index(ctx, prim, ib, &min_index, &max_index); + + _tnl_draw_prims(ctx, arrays, prim, nr_prims, ib, min_index, max_index); +} + +/* This is the main entrypoint into the slimmed-down software tnl + * module. In a regular swtnl driver, this can be plugged straight + * into the vbo->Driver.DrawPrims() callback. + */ +void _tnl_draw_prims( struct gl_context *ctx, + const struct gl_client_array *arrays[], + const struct _mesa_prim *prim, + GLuint nr_prims, + const struct _mesa_index_buffer *ib, + GLuint min_index, + GLuint max_index) +{ + TNLcontext *tnl = TNL_CONTEXT(ctx); + const GLuint TEST_SPLIT = 0; + const GLint max = TEST_SPLIT ? 8 : tnl->vb.Size - MAX_CLIPPED_VERTICES; + GLint max_basevertex = prim->basevertex; + GLuint i; + + /* Mesa core state should have been validated already */ + assert(ctx->NewState == 0x0); + + if (!_mesa_check_conditional_render(ctx)) + return; /* don't draw */ + + for (i = 1; i < nr_prims; i++) + max_basevertex = MAX2(max_basevertex, prim[i].basevertex); + + if (0) + { + printf("%s %d..%d\n", __FUNCTION__, min_index, max_index); + for (i = 0; i < nr_prims; i++) + printf("prim %d: %s start %d count %d\n", i, + _mesa_lookup_enum_by_nr(prim[i].mode), + prim[i].start, + prim[i].count); + } + + if (min_index) { + /* We always translate away calls with min_index != 0. 
+ */ + vbo_rebase_prims( ctx, arrays, prim, nr_prims, ib, + min_index, max_index, + _tnl_vbo_draw_prims ); + return; + } + else if ((GLint)max_index + max_basevertex > max) { + /* The software TNL pipeline has a fixed amount of storage for + * vertices and it is necessary to split incoming drawing commands + * if they exceed that limit. + */ + struct split_limits limits; + limits.max_verts = max; + limits.max_vb_size = ~0; + limits.max_indices = ~0; + + /* This will split the buffers one way or another and + * recursively call back into this function. + */ + vbo_split_prims( ctx, arrays, prim, nr_prims, ib, + 0, max_index + prim->basevertex, + _tnl_vbo_draw_prims, + &limits ); + } + else { + /* May need to map a vertex buffer object for every attribute plus + * one for the index buffer. + */ + struct gl_buffer_object *bo[VERT_ATTRIB_MAX + 1]; + GLuint nr_bo = 0; + GLuint inst; + + for (i = 0; i < nr_prims;) { + GLuint this_nr_prims; + + /* Our SW TNL pipeline doesn't handle basevertex yet, so bind_indices + * will rebase the elements to the basevertex, and we'll only + * emit strings of prims with the same basevertex in one draw call. + */ + for (this_nr_prims = 1; i + this_nr_prims < nr_prims; + this_nr_prims++) { + if (prim[i].basevertex != prim[i + this_nr_prims].basevertex) + break; + } + + assert(prim[i].num_instances > 0); + + /* Binding inputs may imply mapping some vertex buffer objects. + * They will need to be unmapped below. + */ + for (inst = 0; inst < prim[i].num_instances; inst++) { + + bind_prims(ctx, &prim[i], this_nr_prims); + bind_inputs(ctx, arrays, max_index + prim[i].basevertex + 1, + bo, &nr_bo); + bind_indices(ctx, ib, bo, &nr_bo); + + tnl->CurInstance = inst; + TNL_CONTEXT(ctx)->Driver.RunPipeline(ctx); + + unmap_vbos(ctx, bo, nr_bo); + free_space(ctx); + } + + i += this_nr_prims; + } + } +} + diff --git a/mesalib/src/mesa/vbo/vbo_exec_array.c b/mesalib/src/mesa/vbo/vbo_exec_array.c index 13b54d59c..6749541b7 100644 --- a/mesalib/src/mesa/vbo/vbo_exec_array.c +++ b/mesalib/src/mesa/vbo/vbo_exec_array.c @@ -1,1277 +1,1282 @@ -/************************************************************************** - * - * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas. - * Copyright 2009 VMware, Inc. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. - * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR - * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- * - **************************************************************************/ - -#include "main/glheader.h" -#include "main/context.h" -#include "main/state.h" -#include "main/api_validate.h" -#include "main/varray.h" -#include "main/bufferobj.h" -#include "main/enums.h" -#include "main/macros.h" - -#include "vbo_context.h" - - -/** - * Compute min and max elements by scanning the index buffer for - * glDraw[Range]Elements() calls. - * If primitive restart is enabled, we need to ignore restart - * indexes when computing min/max. - */ -void -vbo_get_minmax_index(struct gl_context *ctx, - const struct _mesa_prim *prim, - const struct _mesa_index_buffer *ib, - GLuint *min_index, GLuint *max_index) -{ - const GLboolean restart = ctx->Array.PrimitiveRestart; - const GLuint restartIndex = ctx->Array.RestartIndex; - const GLuint count = prim->count; - const void *indices; - GLuint i; - - if (_mesa_is_bufferobj(ib->obj)) { - const GLvoid *map = - ctx->Driver.MapBuffer(ctx, GL_ELEMENT_ARRAY_BUFFER_ARB, - GL_READ_ONLY, ib->obj); - indices = ADD_POINTERS(map, ib->ptr); - } else { - indices = ib->ptr; - } - - switch (ib->type) { - case GL_UNSIGNED_INT: { - const GLuint *ui_indices = (const GLuint *)indices; - GLuint max_ui = 0; - GLuint min_ui = ~0U; - if (restart) { - for (i = 0; i < count; i++) { - if (ui_indices[i] != restartIndex) { - if (ui_indices[i] > max_ui) max_ui = ui_indices[i]; - if (ui_indices[i] < min_ui) min_ui = ui_indices[i]; - } - } - } - else { - for (i = 0; i < count; i++) { - if (ui_indices[i] > max_ui) max_ui = ui_indices[i]; - if (ui_indices[i] < min_ui) min_ui = ui_indices[i]; - } - } - *min_index = min_ui; - *max_index = max_ui; - break; - } - case GL_UNSIGNED_SHORT: { - const GLushort *us_indices = (const GLushort *)indices; - GLuint max_us = 0; - GLuint min_us = ~0U; - if (restart) { - for (i = 0; i < count; i++) { - if (us_indices[i] != restartIndex) { - if (us_indices[i] > max_us) max_us = us_indices[i]; - if (us_indices[i] < min_us) min_us = us_indices[i]; - } - } - } - else { - for (i = 0; i < count; i++) { - if (us_indices[i] > max_us) max_us = us_indices[i]; - if (us_indices[i] < min_us) min_us = us_indices[i]; - } - } - *min_index = min_us; - *max_index = max_us; - break; - } - case GL_UNSIGNED_BYTE: { - const GLubyte *ub_indices = (const GLubyte *)indices; - GLuint max_ub = 0; - GLuint min_ub = ~0U; - if (restart) { - for (i = 0; i < count; i++) { - if (ub_indices[i] != restartIndex) { - if (ub_indices[i] > max_ub) max_ub = ub_indices[i]; - if (ub_indices[i] < min_ub) min_ub = ub_indices[i]; - } - } - } - else { - for (i = 0; i < count; i++) { - if (ub_indices[i] > max_ub) max_ub = ub_indices[i]; - if (ub_indices[i] < min_ub) min_ub = ub_indices[i]; - } - } - *min_index = min_ub; - *max_index = max_ub; - break; - } - default: - assert(0); - break; - } - - if (_mesa_is_bufferobj(ib->obj)) { - ctx->Driver.UnmapBuffer(ctx, GL_ELEMENT_ARRAY_BUFFER_ARB, ib->obj); - } -} - - -/** - * Check that element 'j' of the array has reasonable data. - * Map VBO if needed. - * For debugging purposes; not normally used. 
- */ -static void -check_array_data(struct gl_context *ctx, struct gl_client_array *array, - GLuint attrib, GLuint j) -{ - if (array->Enabled) { - const void *data = array->Ptr; - if (_mesa_is_bufferobj(array->BufferObj)) { - if (!array->BufferObj->Pointer) { - /* need to map now */ - array->BufferObj->Pointer = - ctx->Driver.MapBuffer(ctx, GL_ARRAY_BUFFER_ARB, - GL_READ_ONLY, array->BufferObj); - } - data = ADD_POINTERS(data, array->BufferObj->Pointer); - } - switch (array->Type) { - case GL_FLOAT: - { - GLfloat *f = (GLfloat *) ((GLubyte *) data + array->StrideB * j); - GLint k; - for (k = 0; k < array->Size; k++) { - if (IS_INF_OR_NAN(f[k]) || - f[k] >= 1.0e20 || f[k] <= -1.0e10) { - printf("Bad array data:\n"); - printf(" Element[%u].%u = %f\n", j, k, f[k]); - printf(" Array %u at %p\n", attrib, (void* ) array); - printf(" Type 0x%x, Size %d, Stride %d\n", - array->Type, array->Size, array->Stride); - printf(" Address/offset %p in Buffer Object %u\n", - array->Ptr, array->BufferObj->Name); - f[k] = 1.0; /* XXX replace the bad value! */ - } - /*assert(!IS_INF_OR_NAN(f[k]));*/ - } - } - break; - default: - ; - } - } -} - - -/** - * Unmap the buffer object referenced by given array, if mapped. - */ -static void -unmap_array_buffer(struct gl_context *ctx, struct gl_client_array *array) -{ - if (array->Enabled && - _mesa_is_bufferobj(array->BufferObj) && - _mesa_bufferobj_mapped(array->BufferObj)) { - ctx->Driver.UnmapBuffer(ctx, GL_ARRAY_BUFFER_ARB, array->BufferObj); - } -} - - -/** - * Examine the array's data for NaNs, etc. - * For debug purposes; not normally used. - */ -static void -check_draw_elements_data(struct gl_context *ctx, GLsizei count, GLenum elemType, - const void *elements, GLint basevertex) -{ - struct gl_array_object *arrayObj = ctx->Array.ArrayObj; - const void *elemMap; - GLint i, k; - - if (_mesa_is_bufferobj(ctx->Array.ElementArrayBufferObj)) { - elemMap = ctx->Driver.MapBuffer(ctx, - GL_ELEMENT_ARRAY_BUFFER_ARB, - GL_READ_ONLY, - ctx->Array.ElementArrayBufferObj); - elements = ADD_POINTERS(elements, elemMap); - } - - for (i = 0; i < count; i++) { - GLuint j; - - /* j = element[i] */ - switch (elemType) { - case GL_UNSIGNED_BYTE: - j = ((const GLubyte *) elements)[i]; - break; - case GL_UNSIGNED_SHORT: - j = ((const GLushort *) elements)[i]; - break; - case GL_UNSIGNED_INT: - j = ((const GLuint *) elements)[i]; - break; - default: - assert(0); - } - - /* check element j of each enabled array */ - check_array_data(ctx, &arrayObj->Vertex, VERT_ATTRIB_POS, j); - check_array_data(ctx, &arrayObj->Normal, VERT_ATTRIB_NORMAL, j); - check_array_data(ctx, &arrayObj->Color, VERT_ATTRIB_COLOR0, j); - check_array_data(ctx, &arrayObj->SecondaryColor, VERT_ATTRIB_COLOR1, j); - for (k = 0; k < Elements(arrayObj->TexCoord); k++) { - check_array_data(ctx, &arrayObj->TexCoord[k], VERT_ATTRIB_TEX0 + k, j); - } - for (k = 0; k < Elements(arrayObj->VertexAttrib); k++) { - check_array_data(ctx, &arrayObj->VertexAttrib[k], - VERT_ATTRIB_GENERIC0 + k, j); - } - } - - if (_mesa_is_bufferobj(ctx->Array.ElementArrayBufferObj)) { - ctx->Driver.UnmapBuffer(ctx, GL_ELEMENT_ARRAY_BUFFER_ARB, - ctx->Array.ElementArrayBufferObj); - } - - unmap_array_buffer(ctx, &arrayObj->Vertex); - unmap_array_buffer(ctx, &arrayObj->Normal); - unmap_array_buffer(ctx, &arrayObj->Color); - for (k = 0; k < Elements(arrayObj->TexCoord); k++) { - unmap_array_buffer(ctx, &arrayObj->TexCoord[k]); - } - for (k = 0; k < Elements(arrayObj->VertexAttrib); k++) { - unmap_array_buffer(ctx, &arrayObj->VertexAttrib[k]); - } -} - 
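
The vbo_get_minmax_index() scan removed above (and re-added unchanged later in this hunk) reduces to a single pass that tracks the smallest and largest element while ignoring the restart index. A minimal standalone sketch of that pass, in plain C with hypothetical names and only the GL_UNSIGNED_INT case (the GL_UNSIGNED_SHORT and GL_UNSIGNED_BYTE cases in the real function are the same loop over narrower element types):

#include <stdint.h>

/* Hypothetical helper, not the Mesa function: min/max of 32-bit indices,
 * skipping the restart index when primitive restart is enabled. */
static void
minmax_indices_u32(const uint32_t *indices, unsigned count,
                   int restart_enabled, uint32_t restart_index,
                   uint32_t *min_out, uint32_t *max_out)
{
   uint32_t lo = UINT32_MAX, hi = 0;
   unsigned i;

   for (i = 0; i < count; i++) {
      if (restart_enabled && indices[i] == restart_index)
         continue;                    /* restart markers don't count */
      if (indices[i] < lo) lo = indices[i];
      if (indices[i] > hi) hi = indices[i];
   }
   *min_out = lo;
   *max_out = hi;
}
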
- -/** - * Check array data, looking for NaNs, etc. - */ -static void -check_draw_arrays_data(struct gl_context *ctx, GLint start, GLsizei count) -{ - /* TO DO */ -} - - -/** - * Print info/data for glDrawArrays(), for debugging. - */ -static void -print_draw_arrays(struct gl_context *ctx, - GLenum mode, GLint start, GLsizei count) -{ - struct vbo_context *vbo = vbo_context(ctx); - struct vbo_exec_context *exec = &vbo->exec; - int i; - - printf("vbo_exec_DrawArrays(mode 0x%x, start %d, count %d):\n", - mode, start, count); - - for (i = 0; i < 32; i++) { - GLuint bufName = exec->array.inputs[i]->BufferObj->Name; - GLint stride = exec->array.inputs[i]->Stride; - printf("attr %2d: size %d stride %d enabled %d " - "ptr %p Bufobj %u\n", - i, - exec->array.inputs[i]->Size, - stride, - /*exec->array.inputs[i]->Enabled,*/ - exec->array.legacy_array[i]->Enabled, - exec->array.inputs[i]->Ptr, - bufName); - - if (bufName) { - struct gl_buffer_object *buf = _mesa_lookup_bufferobj(ctx, bufName); - GLubyte *p = ctx->Driver.MapBuffer(ctx, GL_ARRAY_BUFFER_ARB, - GL_READ_ONLY_ARB, buf); - int offset = (int) (GLintptr) exec->array.inputs[i]->Ptr; - float *f = (float *) (p + offset); - int *k = (int *) f; - int i; - int n = (count * stride) / 4; - if (n > 32) - n = 32; - printf(" Data at offset %d:\n", offset); - for (i = 0; i < n; i++) { - printf(" float[%d] = 0x%08x %f\n", i, k[i], f[i]); - } - ctx->Driver.UnmapBuffer(ctx, GL_ARRAY_BUFFER_ARB, buf); - } - } -} - - -/** - * Bind the VBO executor to the current vertex array object prior - * to drawing. - * - * Just translate the arrayobj into a sane layout. - */ -static void -bind_array_obj(struct gl_context *ctx) -{ - struct vbo_context *vbo = vbo_context(ctx); - struct vbo_exec_context *exec = &vbo->exec; - struct gl_array_object *arrayObj = ctx->Array.ArrayObj; - GLuint i; - - /* TODO: Fix the ArrayObj struct to keep legacy arrays in an array - * rather than as individual named arrays. Then this function can - * go away. - */ - exec->array.legacy_array[VERT_ATTRIB_POS] = &arrayObj->Vertex; - exec->array.legacy_array[VERT_ATTRIB_WEIGHT] = &arrayObj->Weight; - exec->array.legacy_array[VERT_ATTRIB_NORMAL] = &arrayObj->Normal; - exec->array.legacy_array[VERT_ATTRIB_COLOR0] = &arrayObj->Color; - exec->array.legacy_array[VERT_ATTRIB_COLOR1] = &arrayObj->SecondaryColor; - exec->array.legacy_array[VERT_ATTRIB_FOG] = &arrayObj->FogCoord; - exec->array.legacy_array[VERT_ATTRIB_COLOR_INDEX] = &arrayObj->Index; - if (arrayObj->PointSize.Enabled) { - /* this aliases COLOR_INDEX */ - exec->array.legacy_array[VERT_ATTRIB_POINT_SIZE] = &arrayObj->PointSize; - } - exec->array.legacy_array[VERT_ATTRIB_EDGEFLAG] = &arrayObj->EdgeFlag; - - for (i = 0; i < Elements(arrayObj->TexCoord); i++) - exec->array.legacy_array[VERT_ATTRIB_TEX0 + i] = &arrayObj->TexCoord[i]; - - for (i = 0; i < Elements(arrayObj->VertexAttrib); i++) { - assert(i < Elements(exec->array.generic_array)); - exec->array.generic_array[i] = &arrayObj->VertexAttrib[i]; - } - - exec->array.array_obj = arrayObj->Name; -} - - -/** - * Set the vbo->exec->inputs[] pointers to point to the enabled - * vertex arrays. This depends on the current vertex program/shader - * being executed because of whether or not generic vertex arrays - * alias the conventional vertex arrays. - * For arrays that aren't enabled, we set the input[attrib] pointer - * to point at a zero-stride current value "array". 
- */ -static void -recalculate_input_bindings(struct gl_context *ctx) -{ - struct vbo_context *vbo = vbo_context(ctx); - struct vbo_exec_context *exec = &vbo->exec; - const struct gl_client_array **inputs = &exec->array.inputs[0]; - GLbitfield const_inputs = 0x0; - GLuint i; - - exec->array.program_mode = get_program_mode(ctx); - exec->array.enabled_flags = ctx->Array.ArrayObj->_Enabled; - - switch (exec->array.program_mode) { - case VP_NONE: - /* When no vertex program is active (or the vertex program is generated - * from fixed-function state). We put the material values into the - * generic slots. This is the only situation where material values - * are available as per-vertex attributes. - */ - for (i = 0; i <= VERT_ATTRIB_TEX7; i++) { - if (exec->array.legacy_array[i]->Enabled) - inputs[i] = exec->array.legacy_array[i]; - else { - inputs[i] = &vbo->legacy_currval[i]; - const_inputs |= 1 << i; - } - } - - for (i = 0; i < MAT_ATTRIB_MAX; i++) { - inputs[VERT_ATTRIB_GENERIC0 + i] = &vbo->mat_currval[i]; - const_inputs |= 1 << (VERT_ATTRIB_GENERIC0 + i); - } - - /* Could use just about anything, just to fill in the empty - * slots: - */ - for (i = MAT_ATTRIB_MAX; i < VERT_ATTRIB_MAX - VERT_ATTRIB_GENERIC0; i++) { - inputs[VERT_ATTRIB_GENERIC0 + i] = &vbo->generic_currval[i]; - const_inputs |= 1 << (VERT_ATTRIB_GENERIC0 + i); - } - break; - - case VP_NV: - /* NV_vertex_program - attribute arrays alias and override - * conventional, legacy arrays. No materials, and the generic - * slots are vacant. - */ - for (i = 0; i <= VERT_ATTRIB_TEX7; i++) { - if (exec->array.generic_array[i]->Enabled) - inputs[i] = exec->array.generic_array[i]; - else if (exec->array.legacy_array[i]->Enabled) - inputs[i] = exec->array.legacy_array[i]; - else { - inputs[i] = &vbo->legacy_currval[i]; - const_inputs |= 1 << i; - } - } - - /* Could use just about anything, just to fill in the empty - * slots: - */ - for (i = VERT_ATTRIB_GENERIC0; i < VERT_ATTRIB_MAX; i++) { - inputs[i] = &vbo->generic_currval[i - VERT_ATTRIB_GENERIC0]; - const_inputs |= 1 << i; - } - break; - - case VP_ARB: - /* GL_ARB_vertex_program or GLSL vertex shader - Only the generic[0] - * attribute array aliases and overrides the legacy position array. - * - * Otherwise, legacy attributes available in the legacy slots, - * generic attributes in the generic slots and materials are not - * available as per-vertex attributes. - */ - if (exec->array.generic_array[0]->Enabled) - inputs[0] = exec->array.generic_array[0]; - else if (exec->array.legacy_array[0]->Enabled) - inputs[0] = exec->array.legacy_array[0]; - else { - inputs[0] = &vbo->legacy_currval[0]; - const_inputs |= 1 << 0; - } - - for (i = 1; i <= VERT_ATTRIB_TEX7; i++) { - if (exec->array.legacy_array[i]->Enabled) - inputs[i] = exec->array.legacy_array[i]; - else { - inputs[i] = &vbo->legacy_currval[i]; - const_inputs |= 1 << i; - } - } - - for (i = 0; i < MAX_VERTEX_GENERIC_ATTRIBS; i++) { - if (exec->array.generic_array[i]->Enabled) - inputs[VERT_ATTRIB_GENERIC0 + i] = exec->array.generic_array[i]; - else { - inputs[VERT_ATTRIB_GENERIC0 + i] = &vbo->generic_currval[i]; - const_inputs |= 1 << (VERT_ATTRIB_GENERIC0 + i); - } - - } - break; - } - - _mesa_set_varying_vp_inputs( ctx, ~const_inputs ); -} - - -/** - * Examine the enabled vertex arrays to set the exec->array.inputs[] values. - * These will point to the arrays to actually use for drawing. Some will - * be user-provided arrays, other will be zero-stride const-valued arrays. 
- * Note that this might set the _NEW_ARRAY dirty flag so state validation - * must be done after this call. - */ -static void -bind_arrays(struct gl_context *ctx) -{ - bind_array_obj(ctx); - recalculate_input_bindings(ctx); -} - - -/** - * Helper function called by the other DrawArrays() functions below. - * This is where we handle primitive restart for drawing non-indexed - * arrays. If primitive restart is enabled, it typically means - * splitting one DrawArrays() into two. - */ -static void -vbo_draw_arrays(struct gl_context *ctx, GLenum mode, GLint start, - GLsizei count, GLuint numInstances) -{ - struct vbo_context *vbo = vbo_context(ctx); - struct vbo_exec_context *exec = &vbo->exec; - struct _mesa_prim prim[2]; - - bind_arrays(ctx); - - /* Again... because we may have changed the bitmask of per-vertex varying - * attributes. If we regenerate the fixed-function vertex program now - * we may be able to prune down the number of vertex attributes which we - * need in the shader. - */ - if (ctx->NewState) - _mesa_update_state(ctx); - - prim[0].begin = 1; - prim[0].end = 1; - prim[0].weak = 0; - prim[0].pad = 0; - prim[0].mode = mode; - prim[0].start = 0; /* filled in below */ - prim[0].count = 0; /* filled in below */ - prim[0].indexed = 0; - prim[0].basevertex = 0; - prim[0].num_instances = numInstances; - - /* Implement the primitive restart index */ - if (ctx->Array.PrimitiveRestart && ctx->Array.RestartIndex < count) { - GLuint primCount = 0; - - if (ctx->Array.RestartIndex == start) { - /* special case: RestartIndex at beginning */ - if (count > 1) { - prim[0].start = start + 1; - prim[0].count = count - 1; - primCount = 1; - } - } - else if (ctx->Array.RestartIndex == start + count - 1) { - /* special case: RestartIndex at end */ - if (count > 1) { - prim[0].start = start; - prim[0].count = count - 1; - primCount = 1; - } - } - else { - /* general case: RestartIndex in middle, split into two prims */ - prim[0].start = start; - prim[0].count = ctx->Array.RestartIndex - start; - - prim[1] = prim[0]; - prim[1].start = ctx->Array.RestartIndex + 1; - prim[1].count = count - prim[1].start; - - primCount = 2; - } - - if (primCount > 0) { - /* draw one or two prims */ - vbo->draw_prims(ctx, exec->array.inputs, prim, primCount, NULL, - GL_TRUE, start, start + count - 1); - } - } - else { - /* no prim restart */ - prim[0].start = start; - prim[0].count = count; - - vbo->draw_prims(ctx, exec->array.inputs, prim, 1, NULL, - GL_TRUE, start, start + count - 1); - } -} - - - -/** - * Called from glDrawArrays when in immediate mode (not display list mode). - */ -static void GLAPIENTRY -vbo_exec_DrawArrays(GLenum mode, GLint start, GLsizei count) -{ - GET_CURRENT_CONTEXT(ctx); - - if (MESA_VERBOSE & VERBOSE_DRAW) - _mesa_debug(ctx, "glDrawArrays(%s, %d, %d)\n", - _mesa_lookup_enum_by_nr(mode), start, count); - - if (!_mesa_validate_DrawArrays( ctx, mode, start, count )) - return; - - FLUSH_CURRENT( ctx, 0 ); - - if (!_mesa_valid_to_render(ctx, "glDrawArrays")) { - return; - } - - if (0) - check_draw_arrays_data(ctx, start, count); - - vbo_draw_arrays(ctx, mode, start, count, 1); - - if (0) - print_draw_arrays(ctx, mode, start, count); -} - - -/** - * Called from glDrawArraysInstanced when in immediate mode (not - * display list mode). 
- */ -static void GLAPIENTRY -vbo_exec_DrawArraysInstanced(GLenum mode, GLint start, GLsizei count, - GLsizei numInstances) -{ - GET_CURRENT_CONTEXT(ctx); - - if (MESA_VERBOSE & VERBOSE_DRAW) - _mesa_debug(ctx, "glDrawArraysInstanced(%s, %d, %d, %d)\n", - _mesa_lookup_enum_by_nr(mode), start, count, numInstances); - - if (!_mesa_validate_DrawArraysInstanced(ctx, mode, start, count, numInstances)) - return; - - FLUSH_CURRENT( ctx, 0 ); - - if (!_mesa_valid_to_render(ctx, "glDrawArraysInstanced")) { - return; - } - - if (0) - check_draw_arrays_data(ctx, start, count); - - vbo_draw_arrays(ctx, mode, start, count, numInstances); - - if (0) - print_draw_arrays(ctx, mode, start, count); -} - - -/** - * Map GL_ELEMENT_ARRAY_BUFFER and print contents. - * For debugging. - */ -static void -dump_element_buffer(struct gl_context *ctx, GLenum type) -{ - const GLvoid *map = ctx->Driver.MapBuffer(ctx, - GL_ELEMENT_ARRAY_BUFFER_ARB, - GL_READ_ONLY, - ctx->Array.ElementArrayBufferObj); - switch (type) { - case GL_UNSIGNED_BYTE: - { - const GLubyte *us = (const GLubyte *) map; - GLint i; - for (i = 0; i < ctx->Array.ElementArrayBufferObj->Size; i++) { - printf("%02x ", us[i]); - if (i % 32 == 31) - printf("\n"); - } - printf("\n"); - } - break; - case GL_UNSIGNED_SHORT: - { - const GLushort *us = (const GLushort *) map; - GLint i; - for (i = 0; i < ctx->Array.ElementArrayBufferObj->Size / 2; i++) { - printf("%04x ", us[i]); - if (i % 16 == 15) - printf("\n"); - } - printf("\n"); - } - break; - case GL_UNSIGNED_INT: - { - const GLuint *us = (const GLuint *) map; - GLint i; - for (i = 0; i < ctx->Array.ElementArrayBufferObj->Size / 4; i++) { - printf("%08x ", us[i]); - if (i % 8 == 7) - printf("\n"); - } - printf("\n"); - } - break; - default: - ; - } - - ctx->Driver.UnmapBuffer(ctx, GL_ELEMENT_ARRAY_BUFFER_ARB, - ctx->Array.ElementArrayBufferObj); -} - - -/** - * Inner support for both _mesa_DrawElements and _mesa_DrawRangeElements. - * Do the rendering for a glDrawElements or glDrawRangeElements call after - * we've validated buffer bounds, etc. - */ -static void -vbo_validated_drawrangeelements(struct gl_context *ctx, GLenum mode, - GLboolean index_bounds_valid, - GLuint start, GLuint end, - GLsizei count, GLenum type, - const GLvoid *indices, - GLint basevertex, GLint numInstances) -{ - struct vbo_context *vbo = vbo_context(ctx); - struct vbo_exec_context *exec = &vbo->exec; - struct _mesa_index_buffer ib; - struct _mesa_prim prim[1]; - - FLUSH_CURRENT( ctx, 0 ); - - if (!_mesa_valid_to_render(ctx, "glDraw[Range]Elements")) { - return; - } - - bind_arrays( ctx ); - - /* check for dirty state again */ - if (ctx->NewState) - _mesa_update_state( ctx ); - - ib.count = count; - ib.type = type; - ib.obj = ctx->Array.ElementArrayBufferObj; - ib.ptr = indices; - - prim[0].begin = 1; - prim[0].end = 1; - prim[0].weak = 0; - prim[0].pad = 0; - prim[0].mode = mode; - prim[0].start = 0; - prim[0].count = count; - prim[0].indexed = 1; - prim[0].basevertex = basevertex; - prim[0].num_instances = numInstances; - - /* Need to give special consideration to rendering a range of - * indices starting somewhere above zero. Typically the - * application is issuing multiple DrawRangeElements() to draw - * successive primitives layed out linearly in the vertex arrays. - * Unless the vertex arrays are all in a VBO (or locked as with - * CVA), the OpenGL semantics imply that we need to re-read or - * re-upload the vertex data on each draw call. 
- * - * In the case of hardware tnl, we want to avoid starting the - * upload at zero, as it will mean every draw call uploads an - * increasing amount of not-used vertex data. Worse - in the - * software tnl module, all those vertices might be transformed and - * lit but never rendered. - * - * If we just upload or transform the vertices in start..end, - * however, the indices will be incorrect. - * - * At this level, we don't know exactly what the requirements of - * the backend are going to be, though it will likely boil down to - * either: - * - * 1) Do nothing, everything is in a VBO and is processed once - * only. - * - * 2) Adjust the indices and vertex arrays so that start becomes - * zero. - * - * Rather than doing anything here, I'll provide a helper function - * for the latter case elsewhere. - */ - - vbo->draw_prims( ctx, exec->array.inputs, prim, 1, &ib, - index_bounds_valid, start, end ); -} - - -/** - * Called by glDrawRangeElementsBaseVertex() in immediate mode. - */ -static void GLAPIENTRY -vbo_exec_DrawRangeElementsBaseVertex(GLenum mode, - GLuint start, GLuint end, - GLsizei count, GLenum type, - const GLvoid *indices, - GLint basevertex) -{ - static GLuint warnCount = 0; - GET_CURRENT_CONTEXT(ctx); - - if (MESA_VERBOSE & VERBOSE_DRAW) - _mesa_debug(ctx, - "glDrawRangeElementsBaseVertex(%s, %u, %u, %d, %s, %p, %d)\n", - _mesa_lookup_enum_by_nr(mode), start, end, count, - _mesa_lookup_enum_by_nr(type), indices, basevertex); - - if (!_mesa_validate_DrawRangeElements( ctx, mode, start, end, count, - type, indices, basevertex )) - return; - - /* NOTE: It's important that 'end' is a reasonable value. - * in _tnl_draw_prims(), we use end to determine how many vertices - * to transform. If it's too large, we can unnecessarily split prims - * or we can read/write out of memory in several different places! - */ - - /* Catch/fix some potential user errors */ - if (type == GL_UNSIGNED_BYTE) { - start = MIN2(start, 0xff); - end = MIN2(end, 0xff); - } - else if (type == GL_UNSIGNED_SHORT) { - start = MIN2(start, 0xffff); - end = MIN2(end, 0xffff); - } - - if (end >= ctx->Array.ArrayObj->_MaxElement) { - /* the max element is out of bounds of one or more enabled arrays */ - warnCount++; - - if (warnCount < 10) { - _mesa_warning(ctx, "glDraw[Range]Elements(start %u, end %u, count %d, " - "type 0x%x, indices=%p)\n" - "\tend is out of bounds (max=%u) " - "Element Buffer %u (size %d)\n" - "\tThis should probably be fixed in the application.", - start, end, count, type, indices, - ctx->Array.ArrayObj->_MaxElement - 1, - ctx->Array.ElementArrayBufferObj->Name, - (int) ctx->Array.ElementArrayBufferObj->Size); - } - - if (0) - dump_element_buffer(ctx, type); - - if (0) - _mesa_print_arrays(ctx); - -#ifdef DEBUG - /* 'end' was out of bounds, but now let's check the actual array - * indexes to see if any of them are out of bounds. 
- */ - { - GLuint max = _mesa_max_buffer_index(ctx, count, type, indices, - ctx->Array.ElementArrayBufferObj); - if (max >= ctx->Array.ArrayObj->_MaxElement) { - if (warnCount < 10) { - _mesa_warning(ctx, "glDraw[Range]Elements(start %u, end %u, " - "count %d, type 0x%x, indices=%p)\n" - "\tindex=%u is out of bounds (max=%u) " - "Element Buffer %u (size %d)\n" - "\tSkipping the glDrawRangeElements() call", - start, end, count, type, indices, max, - ctx->Array.ArrayObj->_MaxElement - 1, - ctx->Array.ElementArrayBufferObj->Name, - (int) ctx->Array.ElementArrayBufferObj->Size); - } - } - /* XXX we could also find the min index and compare to 'start' - * to see if start is correct. But it's more likely to get the - * upper bound wrong. - */ - } -#endif - - /* Set 'end' to the max possible legal value */ - assert(ctx->Array.ArrayObj->_MaxElement >= 1); - end = ctx->Array.ArrayObj->_MaxElement - 1; - } - else if (0) { - printf("glDraw[Range]Elements{,BaseVertex}" - "(start %u, end %u, type 0x%x, count %d) ElemBuf %u, " - "base %d\n", - start, end, type, count, - ctx->Array.ElementArrayBufferObj->Name, - basevertex); - } - -#if 0 - check_draw_elements_data(ctx, count, type, indices); -#else - (void) check_draw_elements_data; -#endif - - vbo_validated_drawrangeelements(ctx, mode, GL_TRUE, start, end, - count, type, indices, basevertex, 1); -} - - -/** - * Called by glDrawRangeElements() in immediate mode. - */ -static void GLAPIENTRY -vbo_exec_DrawRangeElements(GLenum mode, GLuint start, GLuint end, - GLsizei count, GLenum type, const GLvoid *indices) -{ - GET_CURRENT_CONTEXT(ctx); - - if (MESA_VERBOSE & VERBOSE_DRAW) - _mesa_debug(ctx, - "glDrawRangeElements(%s, %u, %u, %d, %s, %p)\n", - _mesa_lookup_enum_by_nr(mode), start, end, count, - _mesa_lookup_enum_by_nr(type), indices); - - vbo_exec_DrawRangeElementsBaseVertex(mode, start, end, count, type, - indices, 0); -} - - -/** - * Called by glDrawElements() in immediate mode. - */ -static void GLAPIENTRY -vbo_exec_DrawElements(GLenum mode, GLsizei count, GLenum type, - const GLvoid *indices) -{ - GET_CURRENT_CONTEXT(ctx); - - if (MESA_VERBOSE & VERBOSE_DRAW) - _mesa_debug(ctx, "glDrawElements(%s, %u, %s, %p)\n", - _mesa_lookup_enum_by_nr(mode), count, - _mesa_lookup_enum_by_nr(type), indices); - - if (!_mesa_validate_DrawElements( ctx, mode, count, type, indices, 0 )) - return; - - vbo_validated_drawrangeelements(ctx, mode, GL_FALSE, ~0, ~0, - count, type, indices, 0, 1); -} - - -/** - * Called by glDrawElementsBaseVertex() in immediate mode. - */ -static void GLAPIENTRY -vbo_exec_DrawElementsBaseVertex(GLenum mode, GLsizei count, GLenum type, - const GLvoid *indices, GLint basevertex) -{ - GET_CURRENT_CONTEXT(ctx); - - if (MESA_VERBOSE & VERBOSE_DRAW) - _mesa_debug(ctx, "glDrawElementsBaseVertex(%s, %d, %s, %p, %d)\n", - _mesa_lookup_enum_by_nr(mode), count, - _mesa_lookup_enum_by_nr(type), indices, basevertex); - - if (!_mesa_validate_DrawElements( ctx, mode, count, type, indices, - basevertex )) - return; - - vbo_validated_drawrangeelements(ctx, mode, GL_FALSE, ~0, ~0, - count, type, indices, basevertex, 1); -} - - -/** - * Called by glDrawElementsInstanced() in immediate mode. 
- */ -static void GLAPIENTRY -vbo_exec_DrawElementsInstanced(GLenum mode, GLsizei count, GLenum type, - const GLvoid *indices, GLsizei numInstances) -{ - GET_CURRENT_CONTEXT(ctx); - - if (MESA_VERBOSE & VERBOSE_DRAW) - _mesa_debug(ctx, "glDrawElementsInstanced(%s, %d, %s, %p, %d)\n", - _mesa_lookup_enum_by_nr(mode), count, - _mesa_lookup_enum_by_nr(type), indices, numInstances); - - if (!_mesa_validate_DrawElementsInstanced(ctx, mode, count, type, indices, - numInstances)) - return; - - vbo_validated_drawrangeelements(ctx, mode, GL_FALSE, ~0, ~0, - count, type, indices, 0, numInstances); -} - - -/** - * Inner support for both _mesa_MultiDrawElements() and - * _mesa_MultiDrawRangeElements(). - * This does the actual rendering after we've checked array indexes, etc. - */ -static void -vbo_validated_multidrawelements(struct gl_context *ctx, GLenum mode, - const GLsizei *count, GLenum type, - const GLvoid **indices, GLsizei primcount, - const GLint *basevertex) -{ - struct vbo_context *vbo = vbo_context(ctx); - struct vbo_exec_context *exec = &vbo->exec; - struct _mesa_index_buffer ib; - struct _mesa_prim *prim; - unsigned int index_type_size = 0; - uintptr_t min_index_ptr, max_index_ptr; - GLboolean fallback = GL_FALSE; - int i; - - if (primcount == 0) - return; - - FLUSH_CURRENT( ctx, 0 ); - - if (!_mesa_valid_to_render(ctx, "glMultiDrawElements")) { - return; - } - - prim = calloc(1, primcount * sizeof(*prim)); - if (prim == NULL) { - _mesa_error(ctx, GL_OUT_OF_MEMORY, "glMultiDrawElements"); - return; - } - - /* Decide if we can do this all as one set of primitives sharing the - * same index buffer, or if we have to reset the index pointer per - * primitive. - */ - bind_arrays( ctx ); - - /* check for dirty state again */ - if (ctx->NewState) - _mesa_update_state( ctx ); - - switch (type) { - case GL_UNSIGNED_INT: - index_type_size = 4; - break; - case GL_UNSIGNED_SHORT: - index_type_size = 2; - break; - case GL_UNSIGNED_BYTE: - index_type_size = 1; - break; - default: - assert(0); - } - - min_index_ptr = (uintptr_t)indices[0]; - max_index_ptr = 0; - for (i = 0; i < primcount; i++) { - min_index_ptr = MIN2(min_index_ptr, (uintptr_t)indices[i]); - max_index_ptr = MAX2(max_index_ptr, (uintptr_t)indices[i] + - index_type_size * count[i]); - } - - /* Check if we can handle this thing as a bunch of index offsets from the - * same index pointer. If we can't, then we have to fall back to doing - * a draw_prims per primitive. - * Check that the difference between each prim's indexes is a multiple of - * the index/element size. - */ - if (index_type_size != 1) { - for (i = 0; i < primcount; i++) { - if ((((uintptr_t)indices[i] - min_index_ptr) % index_type_size) != 0) { - fallback = GL_TRUE; - break; - } - } - } - - /* If the index buffer isn't in a VBO, then treating the application's - * subranges of the index buffer as one large index buffer may lead to - * us reading unmapped memory. 
- */ - if (!_mesa_is_bufferobj(ctx->Array.ElementArrayBufferObj)) - fallback = GL_TRUE; - - if (!fallback) { - ib.count = (max_index_ptr - min_index_ptr) / index_type_size; - ib.type = type; - ib.obj = ctx->Array.ElementArrayBufferObj; - ib.ptr = (void *)min_index_ptr; - - for (i = 0; i < primcount; i++) { - prim[i].begin = (i == 0); - prim[i].end = (i == primcount - 1); - prim[i].weak = 0; - prim[i].pad = 0; - prim[i].mode = mode; - prim[i].start = ((uintptr_t)indices[i] - min_index_ptr) / index_type_size; - prim[i].count = count[i]; - prim[i].indexed = 1; - prim[i].num_instances = 1; - if (basevertex != NULL) - prim[i].basevertex = basevertex[i]; - else - prim[i].basevertex = 0; - } - - vbo->draw_prims(ctx, exec->array.inputs, prim, primcount, &ib, - GL_FALSE, ~0, ~0); - } else { - /* render one prim at a time */ - for (i = 0; i < primcount; i++) { - ib.count = count[i]; - ib.type = type; - ib.obj = ctx->Array.ElementArrayBufferObj; - ib.ptr = indices[i]; - - prim[0].begin = 1; - prim[0].end = 1; - prim[0].weak = 0; - prim[0].pad = 0; - prim[0].mode = mode; - prim[0].start = 0; - prim[0].count = count[i]; - prim[0].indexed = 1; - prim[0].num_instances = 1; - if (basevertex != NULL) - prim[0].basevertex = basevertex[i]; - else - prim[0].basevertex = 0; - - vbo->draw_prims(ctx, exec->array.inputs, prim, 1, &ib, - GL_FALSE, ~0, ~0); - } - } - - free(prim); -} - - -static void GLAPIENTRY -vbo_exec_MultiDrawElements(GLenum mode, - const GLsizei *count, GLenum type, - const GLvoid **indices, - GLsizei primcount) -{ - GET_CURRENT_CONTEXT(ctx); - GLint i; - - ASSERT_OUTSIDE_BEGIN_END_AND_FLUSH(ctx); - - for (i = 0; i < primcount; i++) { - if (!_mesa_validate_DrawElements(ctx, mode, count[i], type, indices[i], - 0)) - return; - } - - vbo_validated_multidrawelements(ctx, mode, count, type, indices, primcount, - NULL); -} - - -static void GLAPIENTRY -vbo_exec_MultiDrawElementsBaseVertex(GLenum mode, - const GLsizei *count, GLenum type, - const GLvoid **indices, - GLsizei primcount, - const GLsizei *basevertex) -{ - GET_CURRENT_CONTEXT(ctx); - GLint i; - - ASSERT_OUTSIDE_BEGIN_END_AND_FLUSH(ctx); - - for (i = 0; i < primcount; i++) { - if (!_mesa_validate_DrawElements(ctx, mode, count[i], type, indices[i], - basevertex[i])) - return; - } - - vbo_validated_multidrawelements(ctx, mode, count, type, indices, primcount, - basevertex); -} - - -/** - * Plug in the immediate-mode vertex array drawing commands into the - * givven vbo_exec_context object. - */ -void -vbo_exec_array_init( struct vbo_exec_context *exec ) -{ - exec->vtxfmt.DrawArrays = vbo_exec_DrawArrays; - exec->vtxfmt.DrawElements = vbo_exec_DrawElements; - exec->vtxfmt.DrawRangeElements = vbo_exec_DrawRangeElements; - exec->vtxfmt.MultiDrawElementsEXT = vbo_exec_MultiDrawElements; - exec->vtxfmt.DrawElementsBaseVertex = vbo_exec_DrawElementsBaseVertex; - exec->vtxfmt.DrawRangeElementsBaseVertex = vbo_exec_DrawRangeElementsBaseVertex; - exec->vtxfmt.MultiDrawElementsBaseVertex = vbo_exec_MultiDrawElementsBaseVertex; - exec->vtxfmt.DrawArraysInstanced = vbo_exec_DrawArraysInstanced; - exec->vtxfmt.DrawElementsInstanced = vbo_exec_DrawElementsInstanced; -} - - -void -vbo_exec_array_destroy( struct vbo_exec_context *exec ) -{ - /* nothing to do */ -} - - - -/** - * The following functions are only used for OpenGL ES 1/2 support. - * And some aren't even supported (yet) in ES 1/2. 
- */ - - -void GLAPIENTRY -_mesa_DrawArrays(GLenum mode, GLint first, GLsizei count) -{ - vbo_exec_DrawArrays(mode, first, count); -} - - -void GLAPIENTRY -_mesa_DrawElements(GLenum mode, GLsizei count, GLenum type, - const GLvoid *indices) -{ - vbo_exec_DrawElements(mode, count, type, indices); -} - - -void GLAPIENTRY -_mesa_DrawElementsBaseVertex(GLenum mode, GLsizei count, GLenum type, - const GLvoid *indices, GLint basevertex) -{ - vbo_exec_DrawElementsBaseVertex(mode, count, type, indices, basevertex); -} - - -void GLAPIENTRY -_mesa_DrawRangeElements(GLenum mode, GLuint start, GLuint end, GLsizei count, - GLenum type, const GLvoid *indices) -{ - vbo_exec_DrawRangeElements(mode, start, end, count, type, indices); -} - - -void GLAPIENTRY -_mesa_DrawRangeElementsBaseVertex(GLenum mode, GLuint start, GLuint end, - GLsizei count, GLenum type, - const GLvoid *indices, GLint basevertex) -{ - vbo_exec_DrawRangeElementsBaseVertex(mode, start, end, count, type, - indices, basevertex); -} - - -void GLAPIENTRY -_mesa_MultiDrawElementsEXT(GLenum mode, const GLsizei *count, GLenum type, - const GLvoid **indices, GLsizei primcount) -{ - vbo_exec_MultiDrawElements(mode, count, type, indices, primcount); -} - - -void GLAPIENTRY -_mesa_MultiDrawElementsBaseVertex(GLenum mode, - const GLsizei *count, GLenum type, - const GLvoid **indices, GLsizei primcount, - const GLint *basevertex) -{ - vbo_exec_MultiDrawElementsBaseVertex(mode, count, type, indices, - primcount, basevertex); -} +/************************************************************************** + * + * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas. + * Copyright 2009 VMware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#include "main/glheader.h" +#include "main/context.h" +#include "main/state.h" +#include "main/api_validate.h" +#include "main/varray.h" +#include "main/bufferobj.h" +#include "main/enums.h" +#include "main/macros.h" + +#include "vbo_context.h" + + +/** + * Compute min and max elements by scanning the index buffer for + * glDraw[Range]Elements() calls. + * If primitive restart is enabled, we need to ignore restart + * indexes when computing min/max. 
+ */ +void +vbo_get_minmax_index(struct gl_context *ctx, + const struct _mesa_prim *prim, + const struct _mesa_index_buffer *ib, + GLuint *min_index, GLuint *max_index) +{ + const GLboolean restart = ctx->Array.PrimitiveRestart; + const GLuint restartIndex = ctx->Array.RestartIndex; + const GLuint count = prim->count; + const void *indices; + GLuint i; + + if (_mesa_is_bufferobj(ib->obj)) { + const GLvoid *map = + ctx->Driver.MapBuffer(ctx, GL_ELEMENT_ARRAY_BUFFER_ARB, + GL_READ_ONLY, ib->obj); + indices = ADD_POINTERS(map, ib->ptr); + } else { + indices = ib->ptr; + } + + switch (ib->type) { + case GL_UNSIGNED_INT: { + const GLuint *ui_indices = (const GLuint *)indices; + GLuint max_ui = 0; + GLuint min_ui = ~0U; + if (restart) { + for (i = 0; i < count; i++) { + if (ui_indices[i] != restartIndex) { + if (ui_indices[i] > max_ui) max_ui = ui_indices[i]; + if (ui_indices[i] < min_ui) min_ui = ui_indices[i]; + } + } + } + else { + for (i = 0; i < count; i++) { + if (ui_indices[i] > max_ui) max_ui = ui_indices[i]; + if (ui_indices[i] < min_ui) min_ui = ui_indices[i]; + } + } + *min_index = min_ui; + *max_index = max_ui; + break; + } + case GL_UNSIGNED_SHORT: { + const GLushort *us_indices = (const GLushort *)indices; + GLuint max_us = 0; + GLuint min_us = ~0U; + if (restart) { + for (i = 0; i < count; i++) { + if (us_indices[i] != restartIndex) { + if (us_indices[i] > max_us) max_us = us_indices[i]; + if (us_indices[i] < min_us) min_us = us_indices[i]; + } + } + } + else { + for (i = 0; i < count; i++) { + if (us_indices[i] > max_us) max_us = us_indices[i]; + if (us_indices[i] < min_us) min_us = us_indices[i]; + } + } + *min_index = min_us; + *max_index = max_us; + break; + } + case GL_UNSIGNED_BYTE: { + const GLubyte *ub_indices = (const GLubyte *)indices; + GLuint max_ub = 0; + GLuint min_ub = ~0U; + if (restart) { + for (i = 0; i < count; i++) { + if (ub_indices[i] != restartIndex) { + if (ub_indices[i] > max_ub) max_ub = ub_indices[i]; + if (ub_indices[i] < min_ub) min_ub = ub_indices[i]; + } + } + } + else { + for (i = 0; i < count; i++) { + if (ub_indices[i] > max_ub) max_ub = ub_indices[i]; + if (ub_indices[i] < min_ub) min_ub = ub_indices[i]; + } + } + *min_index = min_ub; + *max_index = max_ub; + break; + } + default: + assert(0); + break; + } + + if (_mesa_is_bufferobj(ib->obj)) { + ctx->Driver.UnmapBuffer(ctx, GL_ELEMENT_ARRAY_BUFFER_ARB, ib->obj); + } +} + + +/** + * Check that element 'j' of the array has reasonable data. + * Map VBO if needed. + * For debugging purposes; not normally used. 
+ */ +static void +check_array_data(struct gl_context *ctx, struct gl_client_array *array, + GLuint attrib, GLuint j) +{ + if (array->Enabled) { + const void *data = array->Ptr; + if (_mesa_is_bufferobj(array->BufferObj)) { + if (!array->BufferObj->Pointer) { + /* need to map now */ + array->BufferObj->Pointer = + ctx->Driver.MapBuffer(ctx, GL_ARRAY_BUFFER_ARB, + GL_READ_ONLY, array->BufferObj); + } + data = ADD_POINTERS(data, array->BufferObj->Pointer); + } + switch (array->Type) { + case GL_FLOAT: + { + GLfloat *f = (GLfloat *) ((GLubyte *) data + array->StrideB * j); + GLint k; + for (k = 0; k < array->Size; k++) { + if (IS_INF_OR_NAN(f[k]) || + f[k] >= 1.0e20 || f[k] <= -1.0e10) { + printf("Bad array data:\n"); + printf(" Element[%u].%u = %f\n", j, k, f[k]); + printf(" Array %u at %p\n", attrib, (void* ) array); + printf(" Type 0x%x, Size %d, Stride %d\n", + array->Type, array->Size, array->Stride); + printf(" Address/offset %p in Buffer Object %u\n", + array->Ptr, array->BufferObj->Name); + f[k] = 1.0; /* XXX replace the bad value! */ + } + /*assert(!IS_INF_OR_NAN(f[k]));*/ + } + } + break; + default: + ; + } + } +} + + +/** + * Unmap the buffer object referenced by given array, if mapped. + */ +static void +unmap_array_buffer(struct gl_context *ctx, struct gl_client_array *array) +{ + if (array->Enabled && + _mesa_is_bufferobj(array->BufferObj) && + _mesa_bufferobj_mapped(array->BufferObj)) { + ctx->Driver.UnmapBuffer(ctx, GL_ARRAY_BUFFER_ARB, array->BufferObj); + } +} + + +/** + * Examine the array's data for NaNs, etc. + * For debug purposes; not normally used. + */ +static void +check_draw_elements_data(struct gl_context *ctx, GLsizei count, GLenum elemType, + const void *elements, GLint basevertex) +{ + struct gl_array_object *arrayObj = ctx->Array.ArrayObj; + const void *elemMap; + GLint i, k; + + if (_mesa_is_bufferobj(ctx->Array.ElementArrayBufferObj)) { + elemMap = ctx->Driver.MapBuffer(ctx, + GL_ELEMENT_ARRAY_BUFFER_ARB, + GL_READ_ONLY, + ctx->Array.ElementArrayBufferObj); + elements = ADD_POINTERS(elements, elemMap); + } + + for (i = 0; i < count; i++) { + GLuint j; + + /* j = element[i] */ + switch (elemType) { + case GL_UNSIGNED_BYTE: + j = ((const GLubyte *) elements)[i]; + break; + case GL_UNSIGNED_SHORT: + j = ((const GLushort *) elements)[i]; + break; + case GL_UNSIGNED_INT: + j = ((const GLuint *) elements)[i]; + break; + default: + assert(0); + } + + /* check element j of each enabled array */ + check_array_data(ctx, &arrayObj->Vertex, VERT_ATTRIB_POS, j); + check_array_data(ctx, &arrayObj->Normal, VERT_ATTRIB_NORMAL, j); + check_array_data(ctx, &arrayObj->Color, VERT_ATTRIB_COLOR0, j); + check_array_data(ctx, &arrayObj->SecondaryColor, VERT_ATTRIB_COLOR1, j); + for (k = 0; k < Elements(arrayObj->TexCoord); k++) { + check_array_data(ctx, &arrayObj->TexCoord[k], VERT_ATTRIB_TEX0 + k, j); + } + for (k = 0; k < Elements(arrayObj->VertexAttrib); k++) { + check_array_data(ctx, &arrayObj->VertexAttrib[k], + VERT_ATTRIB_GENERIC0 + k, j); + } + } + + if (_mesa_is_bufferobj(ctx->Array.ElementArrayBufferObj)) { + ctx->Driver.UnmapBuffer(ctx, GL_ELEMENT_ARRAY_BUFFER_ARB, + ctx->Array.ElementArrayBufferObj); + } + + unmap_array_buffer(ctx, &arrayObj->Vertex); + unmap_array_buffer(ctx, &arrayObj->Normal); + unmap_array_buffer(ctx, &arrayObj->Color); + for (k = 0; k < Elements(arrayObj->TexCoord); k++) { + unmap_array_buffer(ctx, &arrayObj->TexCoord[k]); + } + for (k = 0; k < Elements(arrayObj->VertexAttrib); k++) { + unmap_array_buffer(ctx, &arrayObj->VertexAttrib[k]); + } +} + 
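
Both vbo_get_minmax_index() and check_draw_elements_data() above rely on the same detail: when an element array buffer object is bound, the application's "indices" pointer is really a byte offset into that buffer, so the mapped base address has to be added before dereferencing (that is what ADD_POINTERS does). A minimal sketch of that pattern in plain C; map_buffer() and unmap_buffer() are assumed stand-ins, not the ctx->Driver hooks:

#include <stdint.h>

extern void *map_buffer(unsigned buffer_name);    /* assumed helper */
extern void  unmap_buffer(unsigned buffer_name);  /* assumed helper */

/* Read index[0] of a GL_UNSIGNED_INT element buffer: the application's
 * pointer is treated as an offset from the mapped base address. */
static uint32_t
read_first_index(unsigned element_buffer, const void *app_indices)
{
   const uint8_t *base = map_buffer(element_buffer);
   const uint32_t *idx = (const uint32_t *)(base + (uintptr_t)app_indices);
   uint32_t first = idx[0];

   unmap_buffer(element_buffer);
   return first;
}

When no buffer object is bound, the application pointer is used directly, which is why both functions branch on _mesa_is_bufferobj() before mapping.
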
+ +/** + * Check array data, looking for NaNs, etc. + */ +static void +check_draw_arrays_data(struct gl_context *ctx, GLint start, GLsizei count) +{ + /* TO DO */ +} + + +/** + * Print info/data for glDrawArrays(), for debugging. + */ +static void +print_draw_arrays(struct gl_context *ctx, + GLenum mode, GLint start, GLsizei count) +{ + struct vbo_context *vbo = vbo_context(ctx); + struct vbo_exec_context *exec = &vbo->exec; + int i; + + printf("vbo_exec_DrawArrays(mode 0x%x, start %d, count %d):\n", + mode, start, count); + + for (i = 0; i < 32; i++) { + GLuint bufName = exec->array.inputs[i]->BufferObj->Name; + GLint stride = exec->array.inputs[i]->Stride; + printf("attr %2d: size %d stride %d enabled %d " + "ptr %p Bufobj %u\n", + i, + exec->array.inputs[i]->Size, + stride, + /*exec->array.inputs[i]->Enabled,*/ + exec->array.legacy_array[i]->Enabled, + exec->array.inputs[i]->Ptr, + bufName); + + if (bufName) { + struct gl_buffer_object *buf = _mesa_lookup_bufferobj(ctx, bufName); + GLubyte *p = ctx->Driver.MapBuffer(ctx, GL_ARRAY_BUFFER_ARB, + GL_READ_ONLY_ARB, buf); + int offset = (int) (GLintptr) exec->array.inputs[i]->Ptr; + float *f = (float *) (p + offset); + int *k = (int *) f; + int i; + int n = (count * stride) / 4; + if (n > 32) + n = 32; + printf(" Data at offset %d:\n", offset); + for (i = 0; i < n; i++) { + printf(" float[%d] = 0x%08x %f\n", i, k[i], f[i]); + } + ctx->Driver.UnmapBuffer(ctx, GL_ARRAY_BUFFER_ARB, buf); + } + } +} + + +/** + * Bind the VBO executor to the current vertex array object prior + * to drawing. + * + * Just translate the arrayobj into a sane layout. + */ +static void +bind_array_obj(struct gl_context *ctx) +{ + struct vbo_context *vbo = vbo_context(ctx); + struct vbo_exec_context *exec = &vbo->exec; + struct gl_array_object *arrayObj = ctx->Array.ArrayObj; + GLuint i; + + /* TODO: Fix the ArrayObj struct to keep legacy arrays in an array + * rather than as individual named arrays. Then this function can + * go away. + */ + exec->array.legacy_array[VERT_ATTRIB_POS] = &arrayObj->Vertex; + exec->array.legacy_array[VERT_ATTRIB_WEIGHT] = &arrayObj->Weight; + exec->array.legacy_array[VERT_ATTRIB_NORMAL] = &arrayObj->Normal; + exec->array.legacy_array[VERT_ATTRIB_COLOR0] = &arrayObj->Color; + exec->array.legacy_array[VERT_ATTRIB_COLOR1] = &arrayObj->SecondaryColor; + exec->array.legacy_array[VERT_ATTRIB_FOG] = &arrayObj->FogCoord; + exec->array.legacy_array[VERT_ATTRIB_COLOR_INDEX] = &arrayObj->Index; + if (arrayObj->PointSize.Enabled) { + /* this aliases COLOR_INDEX */ + exec->array.legacy_array[VERT_ATTRIB_POINT_SIZE] = &arrayObj->PointSize; + } + exec->array.legacy_array[VERT_ATTRIB_EDGEFLAG] = &arrayObj->EdgeFlag; + + for (i = 0; i < Elements(arrayObj->TexCoord); i++) + exec->array.legacy_array[VERT_ATTRIB_TEX0 + i] = &arrayObj->TexCoord[i]; + + for (i = 0; i < Elements(arrayObj->VertexAttrib); i++) { + assert(i < Elements(exec->array.generic_array)); + exec->array.generic_array[i] = &arrayObj->VertexAttrib[i]; + } + + exec->array.array_obj = arrayObj->Name; +} + + +/** + * Set the vbo->exec->inputs[] pointers to point to the enabled + * vertex arrays. This depends on the current vertex program/shader + * being executed because of whether or not generic vertex arrays + * alias the conventional vertex arrays. + * For arrays that aren't enabled, we set the input[attrib] pointer + * to point at a zero-stride current value "array". 
+ */ +static void +recalculate_input_bindings(struct gl_context *ctx) +{ + struct vbo_context *vbo = vbo_context(ctx); + struct vbo_exec_context *exec = &vbo->exec; + const struct gl_client_array **inputs = &exec->array.inputs[0]; + GLbitfield const_inputs = 0x0; + GLuint i; + + exec->array.program_mode = get_program_mode(ctx); + exec->array.enabled_flags = ctx->Array.ArrayObj->_Enabled; + + switch (exec->array.program_mode) { + case VP_NONE: + /* When no vertex program is active (or the vertex program is generated + * from fixed-function state). We put the material values into the + * generic slots. This is the only situation where material values + * are available as per-vertex attributes. + */ + for (i = 0; i <= VERT_ATTRIB_TEX7; i++) { + if (exec->array.legacy_array[i]->Enabled) + inputs[i] = exec->array.legacy_array[i]; + else { + inputs[i] = &vbo->legacy_currval[i]; + const_inputs |= 1 << i; + } + } + + for (i = 0; i < MAT_ATTRIB_MAX; i++) { + inputs[VERT_ATTRIB_GENERIC0 + i] = &vbo->mat_currval[i]; + const_inputs |= 1 << (VERT_ATTRIB_GENERIC0 + i); + } + + /* Could use just about anything, just to fill in the empty + * slots: + */ + for (i = MAT_ATTRIB_MAX; i < VERT_ATTRIB_MAX - VERT_ATTRIB_GENERIC0; i++) { + inputs[VERT_ATTRIB_GENERIC0 + i] = &vbo->generic_currval[i]; + const_inputs |= 1 << (VERT_ATTRIB_GENERIC0 + i); + } + break; + + case VP_NV: + /* NV_vertex_program - attribute arrays alias and override + * conventional, legacy arrays. No materials, and the generic + * slots are vacant. + */ + for (i = 0; i <= VERT_ATTRIB_TEX7; i++) { + if (exec->array.generic_array[i]->Enabled) + inputs[i] = exec->array.generic_array[i]; + else if (exec->array.legacy_array[i]->Enabled) + inputs[i] = exec->array.legacy_array[i]; + else { + inputs[i] = &vbo->legacy_currval[i]; + const_inputs |= 1 << i; + } + } + + /* Could use just about anything, just to fill in the empty + * slots: + */ + for (i = VERT_ATTRIB_GENERIC0; i < VERT_ATTRIB_MAX; i++) { + inputs[i] = &vbo->generic_currval[i - VERT_ATTRIB_GENERIC0]; + const_inputs |= 1 << i; + } + break; + + case VP_ARB: + /* GL_ARB_vertex_program or GLSL vertex shader - Only the generic[0] + * attribute array aliases and overrides the legacy position array. + * + * Otherwise, legacy attributes available in the legacy slots, + * generic attributes in the generic slots and materials are not + * available as per-vertex attributes. + */ + if (exec->array.generic_array[0]->Enabled) + inputs[0] = exec->array.generic_array[0]; + else if (exec->array.legacy_array[0]->Enabled) + inputs[0] = exec->array.legacy_array[0]; + else { + inputs[0] = &vbo->legacy_currval[0]; + const_inputs |= 1 << 0; + } + + for (i = 1; i <= VERT_ATTRIB_TEX7; i++) { + if (exec->array.legacy_array[i]->Enabled) + inputs[i] = exec->array.legacy_array[i]; + else { + inputs[i] = &vbo->legacy_currval[i]; + const_inputs |= 1 << i; + } + } + + for (i = 0; i < MAX_VERTEX_GENERIC_ATTRIBS; i++) { + if (exec->array.generic_array[i]->Enabled) + inputs[VERT_ATTRIB_GENERIC0 + i] = exec->array.generic_array[i]; + else { + inputs[VERT_ATTRIB_GENERIC0 + i] = &vbo->generic_currval[i]; + const_inputs |= 1 << (VERT_ATTRIB_GENERIC0 + i); + } + + } + break; + } + + _mesa_set_varying_vp_inputs( ctx, ~const_inputs ); +} + + +/** + * Examine the enabled vertex arrays to set the exec->array.inputs[] values. + * These will point to the arrays to actually use for drawing. Some will + * be user-provided arrays, other will be zero-stride const-valued arrays. 
+ * Note that this might set the _NEW_ARRAY dirty flag so state validation + * must be done after this call. + */ +static void +bind_arrays(struct gl_context *ctx) +{ + if (!ctx->Array.RebindArrays) { + return; + } + + bind_array_obj(ctx); + recalculate_input_bindings(ctx); + ctx->Array.RebindArrays = GL_FALSE; +} + + +/** + * Helper function called by the other DrawArrays() functions below. + * This is where we handle primitive restart for drawing non-indexed + * arrays. If primitive restart is enabled, it typically means + * splitting one DrawArrays() into two. + */ +static void +vbo_draw_arrays(struct gl_context *ctx, GLenum mode, GLint start, + GLsizei count, GLuint numInstances) +{ + struct vbo_context *vbo = vbo_context(ctx); + struct vbo_exec_context *exec = &vbo->exec; + struct _mesa_prim prim[2]; + + bind_arrays(ctx); + + /* Again... because we may have changed the bitmask of per-vertex varying + * attributes. If we regenerate the fixed-function vertex program now + * we may be able to prune down the number of vertex attributes which we + * need in the shader. + */ + if (ctx->NewState) + _mesa_update_state(ctx); + + prim[0].begin = 1; + prim[0].end = 1; + prim[0].weak = 0; + prim[0].pad = 0; + prim[0].mode = mode; + prim[0].start = 0; /* filled in below */ + prim[0].count = 0; /* filled in below */ + prim[0].indexed = 0; + prim[0].basevertex = 0; + prim[0].num_instances = numInstances; + + /* Implement the primitive restart index */ + if (ctx->Array.PrimitiveRestart && ctx->Array.RestartIndex < count) { + GLuint primCount = 0; + + if (ctx->Array.RestartIndex == start) { + /* special case: RestartIndex at beginning */ + if (count > 1) { + prim[0].start = start + 1; + prim[0].count = count - 1; + primCount = 1; + } + } + else if (ctx->Array.RestartIndex == start + count - 1) { + /* special case: RestartIndex at end */ + if (count > 1) { + prim[0].start = start; + prim[0].count = count - 1; + primCount = 1; + } + } + else { + /* general case: RestartIndex in middle, split into two prims */ + prim[0].start = start; + prim[0].count = ctx->Array.RestartIndex - start; + + prim[1] = prim[0]; + prim[1].start = ctx->Array.RestartIndex + 1; + prim[1].count = count - prim[1].start; + + primCount = 2; + } + + if (primCount > 0) { + /* draw one or two prims */ + vbo->draw_prims(ctx, exec->array.inputs, prim, primCount, NULL, + GL_TRUE, start, start + count - 1); + } + } + else { + /* no prim restart */ + prim[0].start = start; + prim[0].count = count; + + vbo->draw_prims(ctx, exec->array.inputs, prim, 1, NULL, + GL_TRUE, start, start + count - 1); + } +} + + + +/** + * Called from glDrawArrays when in immediate mode (not display list mode). + */ +static void GLAPIENTRY +vbo_exec_DrawArrays(GLenum mode, GLint start, GLsizei count) +{ + GET_CURRENT_CONTEXT(ctx); + + if (MESA_VERBOSE & VERBOSE_DRAW) + _mesa_debug(ctx, "glDrawArrays(%s, %d, %d)\n", + _mesa_lookup_enum_by_nr(mode), start, count); + + if (!_mesa_validate_DrawArrays( ctx, mode, start, count )) + return; + + FLUSH_CURRENT( ctx, 0 ); + + if (!_mesa_valid_to_render(ctx, "glDrawArrays")) { + return; + } + + if (0) + check_draw_arrays_data(ctx, start, count); + + vbo_draw_arrays(ctx, mode, start, count, 1); + + if (0) + print_draw_arrays(ctx, mode, start, count); +} + + +/** + * Called from glDrawArraysInstanced when in immediate mode (not + * display list mode). 
+ */ +static void GLAPIENTRY +vbo_exec_DrawArraysInstanced(GLenum mode, GLint start, GLsizei count, + GLsizei numInstances) +{ + GET_CURRENT_CONTEXT(ctx); + + if (MESA_VERBOSE & VERBOSE_DRAW) + _mesa_debug(ctx, "glDrawArraysInstanced(%s, %d, %d, %d)\n", + _mesa_lookup_enum_by_nr(mode), start, count, numInstances); + + if (!_mesa_validate_DrawArraysInstanced(ctx, mode, start, count, numInstances)) + return; + + FLUSH_CURRENT( ctx, 0 ); + + if (!_mesa_valid_to_render(ctx, "glDrawArraysInstanced")) { + return; + } + + if (0) + check_draw_arrays_data(ctx, start, count); + + vbo_draw_arrays(ctx, mode, start, count, numInstances); + + if (0) + print_draw_arrays(ctx, mode, start, count); +} + + +/** + * Map GL_ELEMENT_ARRAY_BUFFER and print contents. + * For debugging. + */ +static void +dump_element_buffer(struct gl_context *ctx, GLenum type) +{ + const GLvoid *map = ctx->Driver.MapBuffer(ctx, + GL_ELEMENT_ARRAY_BUFFER_ARB, + GL_READ_ONLY, + ctx->Array.ElementArrayBufferObj); + switch (type) { + case GL_UNSIGNED_BYTE: + { + const GLubyte *us = (const GLubyte *) map; + GLint i; + for (i = 0; i < ctx->Array.ElementArrayBufferObj->Size; i++) { + printf("%02x ", us[i]); + if (i % 32 == 31) + printf("\n"); + } + printf("\n"); + } + break; + case GL_UNSIGNED_SHORT: + { + const GLushort *us = (const GLushort *) map; + GLint i; + for (i = 0; i < ctx->Array.ElementArrayBufferObj->Size / 2; i++) { + printf("%04x ", us[i]); + if (i % 16 == 15) + printf("\n"); + } + printf("\n"); + } + break; + case GL_UNSIGNED_INT: + { + const GLuint *us = (const GLuint *) map; + GLint i; + for (i = 0; i < ctx->Array.ElementArrayBufferObj->Size / 4; i++) { + printf("%08x ", us[i]); + if (i % 8 == 7) + printf("\n"); + } + printf("\n"); + } + break; + default: + ; + } + + ctx->Driver.UnmapBuffer(ctx, GL_ELEMENT_ARRAY_BUFFER_ARB, + ctx->Array.ElementArrayBufferObj); +} + + +/** + * Inner support for both _mesa_DrawElements and _mesa_DrawRangeElements. + * Do the rendering for a glDrawElements or glDrawRangeElements call after + * we've validated buffer bounds, etc. + */ +static void +vbo_validated_drawrangeelements(struct gl_context *ctx, GLenum mode, + GLboolean index_bounds_valid, + GLuint start, GLuint end, + GLsizei count, GLenum type, + const GLvoid *indices, + GLint basevertex, GLint numInstances) +{ + struct vbo_context *vbo = vbo_context(ctx); + struct vbo_exec_context *exec = &vbo->exec; + struct _mesa_index_buffer ib; + struct _mesa_prim prim[1]; + + FLUSH_CURRENT( ctx, 0 ); + + if (!_mesa_valid_to_render(ctx, "glDraw[Range]Elements")) { + return; + } + + bind_arrays( ctx ); + + /* check for dirty state again */ + if (ctx->NewState) + _mesa_update_state( ctx ); + + ib.count = count; + ib.type = type; + ib.obj = ctx->Array.ElementArrayBufferObj; + ib.ptr = indices; + + prim[0].begin = 1; + prim[0].end = 1; + prim[0].weak = 0; + prim[0].pad = 0; + prim[0].mode = mode; + prim[0].start = 0; + prim[0].count = count; + prim[0].indexed = 1; + prim[0].basevertex = basevertex; + prim[0].num_instances = numInstances; + + /* Need to give special consideration to rendering a range of + * indices starting somewhere above zero. Typically the + * application is issuing multiple DrawRangeElements() to draw + * successive primitives layed out linearly in the vertex arrays. + * Unless the vertex arrays are all in a VBO (or locked as with + * CVA), the OpenGL semantics imply that we need to re-read or + * re-upload the vertex data on each draw call. 
+ * + * In the case of hardware tnl, we want to avoid starting the + * upload at zero, as it will mean every draw call uploads an + * increasing amount of not-used vertex data. Worse - in the + * software tnl module, all those vertices might be transformed and + * lit but never rendered. + * + * If we just upload or transform the vertices in start..end, + * however, the indices will be incorrect. + * + * At this level, we don't know exactly what the requirements of + * the backend are going to be, though it will likely boil down to + * either: + * + * 1) Do nothing, everything is in a VBO and is processed once + * only. + * + * 2) Adjust the indices and vertex arrays so that start becomes + * zero. + * + * Rather than doing anything here, I'll provide a helper function + * for the latter case elsewhere. + */ + + vbo->draw_prims( ctx, exec->array.inputs, prim, 1, &ib, + index_bounds_valid, start, end ); +} + + +/** + * Called by glDrawRangeElementsBaseVertex() in immediate mode. + */ +static void GLAPIENTRY +vbo_exec_DrawRangeElementsBaseVertex(GLenum mode, + GLuint start, GLuint end, + GLsizei count, GLenum type, + const GLvoid *indices, + GLint basevertex) +{ + static GLuint warnCount = 0; + GET_CURRENT_CONTEXT(ctx); + + if (MESA_VERBOSE & VERBOSE_DRAW) + _mesa_debug(ctx, + "glDrawRangeElementsBaseVertex(%s, %u, %u, %d, %s, %p, %d)\n", + _mesa_lookup_enum_by_nr(mode), start, end, count, + _mesa_lookup_enum_by_nr(type), indices, basevertex); + + if (!_mesa_validate_DrawRangeElements( ctx, mode, start, end, count, + type, indices, basevertex )) + return; + + /* NOTE: It's important that 'end' is a reasonable value. + * in _tnl_draw_prims(), we use end to determine how many vertices + * to transform. If it's too large, we can unnecessarily split prims + * or we can read/write out of memory in several different places! + */ + + /* Catch/fix some potential user errors */ + if (type == GL_UNSIGNED_BYTE) { + start = MIN2(start, 0xff); + end = MIN2(end, 0xff); + } + else if (type == GL_UNSIGNED_SHORT) { + start = MIN2(start, 0xffff); + end = MIN2(end, 0xffff); + } + + if (end >= ctx->Array.ArrayObj->_MaxElement) { + /* the max element is out of bounds of one or more enabled arrays */ + warnCount++; + + if (warnCount < 10) { + _mesa_warning(ctx, "glDraw[Range]Elements(start %u, end %u, count %d, " + "type 0x%x, indices=%p)\n" + "\tend is out of bounds (max=%u) " + "Element Buffer %u (size %d)\n" + "\tThis should probably be fixed in the application.", + start, end, count, type, indices, + ctx->Array.ArrayObj->_MaxElement - 1, + ctx->Array.ElementArrayBufferObj->Name, + (int) ctx->Array.ElementArrayBufferObj->Size); + } + + if (0) + dump_element_buffer(ctx, type); + + if (0) + _mesa_print_arrays(ctx); + +#ifdef DEBUG + /* 'end' was out of bounds, but now let's check the actual array + * indexes to see if any of them are out of bounds. 
+ */ + { + GLuint max = _mesa_max_buffer_index(ctx, count, type, indices, + ctx->Array.ElementArrayBufferObj); + if (max >= ctx->Array.ArrayObj->_MaxElement) { + if (warnCount < 10) { + _mesa_warning(ctx, "glDraw[Range]Elements(start %u, end %u, " + "count %d, type 0x%x, indices=%p)\n" + "\tindex=%u is out of bounds (max=%u) " + "Element Buffer %u (size %d)\n" + "\tSkipping the glDrawRangeElements() call", + start, end, count, type, indices, max, + ctx->Array.ArrayObj->_MaxElement - 1, + ctx->Array.ElementArrayBufferObj->Name, + (int) ctx->Array.ElementArrayBufferObj->Size); + } + } + /* XXX we could also find the min index and compare to 'start' + * to see if start is correct. But it's more likely to get the + * upper bound wrong. + */ + } +#endif + + /* Set 'end' to the max possible legal value */ + assert(ctx->Array.ArrayObj->_MaxElement >= 1); + end = ctx->Array.ArrayObj->_MaxElement - 1; + } + else if (0) { + printf("glDraw[Range]Elements{,BaseVertex}" + "(start %u, end %u, type 0x%x, count %d) ElemBuf %u, " + "base %d\n", + start, end, type, count, + ctx->Array.ElementArrayBufferObj->Name, + basevertex); + } + +#if 0 + check_draw_elements_data(ctx, count, type, indices); +#else + (void) check_draw_elements_data; +#endif + + vbo_validated_drawrangeelements(ctx, mode, GL_TRUE, start, end, + count, type, indices, basevertex, 1); +} + + +/** + * Called by glDrawRangeElements() in immediate mode. + */ +static void GLAPIENTRY +vbo_exec_DrawRangeElements(GLenum mode, GLuint start, GLuint end, + GLsizei count, GLenum type, const GLvoid *indices) +{ + GET_CURRENT_CONTEXT(ctx); + + if (MESA_VERBOSE & VERBOSE_DRAW) + _mesa_debug(ctx, + "glDrawRangeElements(%s, %u, %u, %d, %s, %p)\n", + _mesa_lookup_enum_by_nr(mode), start, end, count, + _mesa_lookup_enum_by_nr(type), indices); + + vbo_exec_DrawRangeElementsBaseVertex(mode, start, end, count, type, + indices, 0); +} + + +/** + * Called by glDrawElements() in immediate mode. + */ +static void GLAPIENTRY +vbo_exec_DrawElements(GLenum mode, GLsizei count, GLenum type, + const GLvoid *indices) +{ + GET_CURRENT_CONTEXT(ctx); + + if (MESA_VERBOSE & VERBOSE_DRAW) + _mesa_debug(ctx, "glDrawElements(%s, %u, %s, %p)\n", + _mesa_lookup_enum_by_nr(mode), count, + _mesa_lookup_enum_by_nr(type), indices); + + if (!_mesa_validate_DrawElements( ctx, mode, count, type, indices, 0 )) + return; + + vbo_validated_drawrangeelements(ctx, mode, GL_FALSE, ~0, ~0, + count, type, indices, 0, 1); +} + + +/** + * Called by glDrawElementsBaseVertex() in immediate mode. + */ +static void GLAPIENTRY +vbo_exec_DrawElementsBaseVertex(GLenum mode, GLsizei count, GLenum type, + const GLvoid *indices, GLint basevertex) +{ + GET_CURRENT_CONTEXT(ctx); + + if (MESA_VERBOSE & VERBOSE_DRAW) + _mesa_debug(ctx, "glDrawElementsBaseVertex(%s, %d, %s, %p, %d)\n", + _mesa_lookup_enum_by_nr(mode), count, + _mesa_lookup_enum_by_nr(type), indices, basevertex); + + if (!_mesa_validate_DrawElements( ctx, mode, count, type, indices, + basevertex )) + return; + + vbo_validated_drawrangeelements(ctx, mode, GL_FALSE, ~0, ~0, + count, type, indices, basevertex, 1); +} + + +/** + * Called by glDrawElementsInstanced() in immediate mode. 
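The DEBUG block above asks _mesa_max_buffer_index() for the real upper bound of the indices so it can warn when the application's 'end' was wrong. For a client-memory array that scan is just a typed loop; a minimal standalone version (hypothetical name) might be:

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

/* Walk a client-memory index array and return the largest index found;
 * elem_size is 1, 2 or 4 bytes (GL_UNSIGNED_BYTE/SHORT/INT). */
static uint32_t
max_index(const void *indices, size_t count, size_t elem_size)
{
   uint32_t max = 0;
   size_t i;

   for (i = 0; i < count; i++) {
      uint32_t v = 0;
      switch (elem_size) {
      case 1: v = ((const uint8_t *) indices)[i];  break;
      case 2: v = ((const uint16_t *) indices)[i]; break;
      case 4: v = ((const uint32_t *) indices)[i]; break;
      }
      if (v > max)
         max = v;
   }
   return max;
}

int
main(void)
{
   static const uint8_t idx[4] = { 3, 7, 250, 9 };
   printf("max index = %u\n", (unsigned) max_index(idx, 4, 1));
   return 0;
}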
+ */ +static void GLAPIENTRY +vbo_exec_DrawElementsInstanced(GLenum mode, GLsizei count, GLenum type, + const GLvoid *indices, GLsizei numInstances) +{ + GET_CURRENT_CONTEXT(ctx); + + if (MESA_VERBOSE & VERBOSE_DRAW) + _mesa_debug(ctx, "glDrawElementsInstanced(%s, %d, %s, %p, %d)\n", + _mesa_lookup_enum_by_nr(mode), count, + _mesa_lookup_enum_by_nr(type), indices, numInstances); + + if (!_mesa_validate_DrawElementsInstanced(ctx, mode, count, type, indices, + numInstances)) + return; + + vbo_validated_drawrangeelements(ctx, mode, GL_FALSE, ~0, ~0, + count, type, indices, 0, numInstances); +} + + +/** + * Inner support for both _mesa_MultiDrawElements() and + * _mesa_MultiDrawRangeElements(). + * This does the actual rendering after we've checked array indexes, etc. + */ +static void +vbo_validated_multidrawelements(struct gl_context *ctx, GLenum mode, + const GLsizei *count, GLenum type, + const GLvoid **indices, GLsizei primcount, + const GLint *basevertex) +{ + struct vbo_context *vbo = vbo_context(ctx); + struct vbo_exec_context *exec = &vbo->exec; + struct _mesa_index_buffer ib; + struct _mesa_prim *prim; + unsigned int index_type_size = 0; + uintptr_t min_index_ptr, max_index_ptr; + GLboolean fallback = GL_FALSE; + int i; + + if (primcount == 0) + return; + + FLUSH_CURRENT( ctx, 0 ); + + if (!_mesa_valid_to_render(ctx, "glMultiDrawElements")) { + return; + } + + prim = calloc(1, primcount * sizeof(*prim)); + if (prim == NULL) { + _mesa_error(ctx, GL_OUT_OF_MEMORY, "glMultiDrawElements"); + return; + } + + /* Decide if we can do this all as one set of primitives sharing the + * same index buffer, or if we have to reset the index pointer per + * primitive. + */ + bind_arrays( ctx ); + + /* check for dirty state again */ + if (ctx->NewState) + _mesa_update_state( ctx ); + + switch (type) { + case GL_UNSIGNED_INT: + index_type_size = 4; + break; + case GL_UNSIGNED_SHORT: + index_type_size = 2; + break; + case GL_UNSIGNED_BYTE: + index_type_size = 1; + break; + default: + assert(0); + } + + min_index_ptr = (uintptr_t)indices[0]; + max_index_ptr = 0; + for (i = 0; i < primcount; i++) { + min_index_ptr = MIN2(min_index_ptr, (uintptr_t)indices[i]); + max_index_ptr = MAX2(max_index_ptr, (uintptr_t)indices[i] + + index_type_size * count[i]); + } + + /* Check if we can handle this thing as a bunch of index offsets from the + * same index pointer. If we can't, then we have to fall back to doing + * a draw_prims per primitive. + * Check that the difference between each prim's indexes is a multiple of + * the index/element size. + */ + if (index_type_size != 1) { + for (i = 0; i < primcount; i++) { + if ((((uintptr_t)indices[i] - min_index_ptr) % index_type_size) != 0) { + fallback = GL_TRUE; + break; + } + } + } + + /* If the index buffer isn't in a VBO, then treating the application's + * subranges of the index buffer as one large index buffer may lead to + * us reading unmapped memory. 
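The pointer scan above decides whether all per-primitive index pointers can be treated as element offsets into one shared index buffer. A standalone sketch of that decision (hypothetical names; as the comment notes, the real code also falls back whenever the indices are not in a buffer object):

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>
#include <stddef.h>

/* Decide whether 'primcount' index pointers can share a single index buffer:
 * compute the overall [min, max) byte span and require every pointer to sit
 * a whole number of elements above the lowest one. */
static bool
can_share_index_buffer(const void **indices, const int *count, int primcount,
                       size_t elem_size, uintptr_t *min_ptr, uintptr_t *max_ptr)
{
   int i;

   *min_ptr = (uintptr_t) indices[0];
   *max_ptr = 0;
   for (i = 0; i < primcount; i++) {
      const uintptr_t lo = (uintptr_t) indices[i];
      const uintptr_t hi = lo + elem_size * (size_t) count[i];
      if (lo < *min_ptr) *min_ptr = lo;
      if (hi > *max_ptr) *max_ptr = hi;
   }

   if (elem_size == 1)
      return true;
   for (i = 0; i < primcount; i++) {
      if (((uintptr_t) indices[i] - *min_ptr) % elem_size != 0)
         return false;
   }
   return true;
}

int
main(void)
{
   static const uint16_t idx[6] = { 0, 1, 2, 2, 1, 3 };
   const void *ptrs[2] = { &idx[0], &idx[3] };
   const int counts[2] = { 3, 3 };
   uintptr_t lo, hi;
   bool shareable = can_share_index_buffer(ptrs, counts, 2, sizeof idx[0], &lo, &hi);
   printf("shareable=%d span=%lu bytes\n", shareable, (unsigned long) (hi - lo));
   return 0;
}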
+ */ + if (!_mesa_is_bufferobj(ctx->Array.ElementArrayBufferObj)) + fallback = GL_TRUE; + + if (!fallback) { + ib.count = (max_index_ptr - min_index_ptr) / index_type_size; + ib.type = type; + ib.obj = ctx->Array.ElementArrayBufferObj; + ib.ptr = (void *)min_index_ptr; + + for (i = 0; i < primcount; i++) { + prim[i].begin = (i == 0); + prim[i].end = (i == primcount - 1); + prim[i].weak = 0; + prim[i].pad = 0; + prim[i].mode = mode; + prim[i].start = ((uintptr_t)indices[i] - min_index_ptr) / index_type_size; + prim[i].count = count[i]; + prim[i].indexed = 1; + prim[i].num_instances = 1; + if (basevertex != NULL) + prim[i].basevertex = basevertex[i]; + else + prim[i].basevertex = 0; + } + + vbo->draw_prims(ctx, exec->array.inputs, prim, primcount, &ib, + GL_FALSE, ~0, ~0); + } else { + /* render one prim at a time */ + for (i = 0; i < primcount; i++) { + ib.count = count[i]; + ib.type = type; + ib.obj = ctx->Array.ElementArrayBufferObj; + ib.ptr = indices[i]; + + prim[0].begin = 1; + prim[0].end = 1; + prim[0].weak = 0; + prim[0].pad = 0; + prim[0].mode = mode; + prim[0].start = 0; + prim[0].count = count[i]; + prim[0].indexed = 1; + prim[0].num_instances = 1; + if (basevertex != NULL) + prim[0].basevertex = basevertex[i]; + else + prim[0].basevertex = 0; + + vbo->draw_prims(ctx, exec->array.inputs, prim, 1, &ib, + GL_FALSE, ~0, ~0); + } + } + + free(prim); +} + + +static void GLAPIENTRY +vbo_exec_MultiDrawElements(GLenum mode, + const GLsizei *count, GLenum type, + const GLvoid **indices, + GLsizei primcount) +{ + GET_CURRENT_CONTEXT(ctx); + GLint i; + + ASSERT_OUTSIDE_BEGIN_END_AND_FLUSH(ctx); + + for (i = 0; i < primcount; i++) { + if (!_mesa_validate_DrawElements(ctx, mode, count[i], type, indices[i], + 0)) + return; + } + + vbo_validated_multidrawelements(ctx, mode, count, type, indices, primcount, + NULL); +} + + +static void GLAPIENTRY +vbo_exec_MultiDrawElementsBaseVertex(GLenum mode, + const GLsizei *count, GLenum type, + const GLvoid **indices, + GLsizei primcount, + const GLsizei *basevertex) +{ + GET_CURRENT_CONTEXT(ctx); + GLint i; + + ASSERT_OUTSIDE_BEGIN_END_AND_FLUSH(ctx); + + for (i = 0; i < primcount; i++) { + if (!_mesa_validate_DrawElements(ctx, mode, count[i], type, indices[i], + basevertex[i])) + return; + } + + vbo_validated_multidrawelements(ctx, mode, count, type, indices, primcount, + basevertex); +} + + +/** + * Plug in the immediate-mode vertex array drawing commands into the + * givven vbo_exec_context object. + */ +void +vbo_exec_array_init( struct vbo_exec_context *exec ) +{ + exec->vtxfmt.DrawArrays = vbo_exec_DrawArrays; + exec->vtxfmt.DrawElements = vbo_exec_DrawElements; + exec->vtxfmt.DrawRangeElements = vbo_exec_DrawRangeElements; + exec->vtxfmt.MultiDrawElementsEXT = vbo_exec_MultiDrawElements; + exec->vtxfmt.DrawElementsBaseVertex = vbo_exec_DrawElementsBaseVertex; + exec->vtxfmt.DrawRangeElementsBaseVertex = vbo_exec_DrawRangeElementsBaseVertex; + exec->vtxfmt.MultiDrawElementsBaseVertex = vbo_exec_MultiDrawElementsBaseVertex; + exec->vtxfmt.DrawArraysInstanced = vbo_exec_DrawArraysInstanced; + exec->vtxfmt.DrawElementsInstanced = vbo_exec_DrawElementsInstanced; +} + + +void +vbo_exec_array_destroy( struct vbo_exec_context *exec ) +{ + /* nothing to do */ +} + + + +/** + * The following functions are only used for OpenGL ES 1/2 support. + * And some aren't even supported (yet) in ES 1/2. 
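vbo_exec_array_init() above simply plugs the immediate-mode entry points into a table of function pointers that the dispatch layer calls later. Stripped of the GL specifics, the pattern is just this (hypothetical types and names):

#include <stdio.h>

/* A cut-down "vtxfmt"-style table: callers go through the function pointers,
 * and an init routine plugs in the implementations, much as
 * vbo_exec_array_init() does for the real dispatch table. */
struct draw_vtable {
   void (*DrawArrays)(int mode, int first, int count);
   void (*DrawElements)(int mode, int count, int type, const void *indices);
};

static void
my_draw_arrays(int mode, int first, int count)
{
   printf("DrawArrays(mode=%d, first=%d, count=%d)\n", mode, first, count);
}

static void
my_draw_elements(int mode, int count, int type, const void *indices)
{
   (void) indices;
   printf("DrawElements(mode=%d, count=%d, type=%d)\n", mode, count, type);
}

static void
draw_vtable_init(struct draw_vtable *v)
{
   v->DrawArrays   = my_draw_arrays;
   v->DrawElements = my_draw_elements;
}

int
main(void)
{
   struct draw_vtable v;
   draw_vtable_init(&v);
   v.DrawArrays(0, 0, 3);
   return 0;
}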
+ */ + + +void GLAPIENTRY +_mesa_DrawArrays(GLenum mode, GLint first, GLsizei count) +{ + vbo_exec_DrawArrays(mode, first, count); +} + + +void GLAPIENTRY +_mesa_DrawElements(GLenum mode, GLsizei count, GLenum type, + const GLvoid *indices) +{ + vbo_exec_DrawElements(mode, count, type, indices); +} + + +void GLAPIENTRY +_mesa_DrawElementsBaseVertex(GLenum mode, GLsizei count, GLenum type, + const GLvoid *indices, GLint basevertex) +{ + vbo_exec_DrawElementsBaseVertex(mode, count, type, indices, basevertex); +} + + +void GLAPIENTRY +_mesa_DrawRangeElements(GLenum mode, GLuint start, GLuint end, GLsizei count, + GLenum type, const GLvoid *indices) +{ + vbo_exec_DrawRangeElements(mode, start, end, count, type, indices); +} + + +void GLAPIENTRY +_mesa_DrawRangeElementsBaseVertex(GLenum mode, GLuint start, GLuint end, + GLsizei count, GLenum type, + const GLvoid *indices, GLint basevertex) +{ + vbo_exec_DrawRangeElementsBaseVertex(mode, start, end, count, type, + indices, basevertex); +} + + +void GLAPIENTRY +_mesa_MultiDrawElementsEXT(GLenum mode, const GLsizei *count, GLenum type, + const GLvoid **indices, GLsizei primcount) +{ + vbo_exec_MultiDrawElements(mode, count, type, indices, primcount); +} + + +void GLAPIENTRY +_mesa_MultiDrawElementsBaseVertex(GLenum mode, + const GLsizei *count, GLenum type, + const GLvoid **indices, GLsizei primcount, + const GLint *basevertex) +{ + vbo_exec_MultiDrawElementsBaseVertex(mode, count, type, indices, + primcount, basevertex); +} diff --git a/mesalib/src/mesa/vbo/vbo_exec_draw.c b/mesalib/src/mesa/vbo/vbo_exec_draw.c index 048f3d170..f8be83ea8 100644 --- a/mesalib/src/mesa/vbo/vbo_exec_draw.c +++ b/mesalib/src/mesa/vbo/vbo_exec_draw.c @@ -1,420 +1,421 @@ -/* - * Mesa 3-D graphics library - * Version: 7.2 - * - * Copyright (C) 1999-2008 Brian Paul All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN - * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- * - * Authors: - * Keith Whitwell - */ - -#include "main/glheader.h" -#include "main/bufferobj.h" -#include "main/compiler.h" -#include "main/enums.h" -#include "main/mfeatures.h" -#include "main/state.h" - -#include "vbo_context.h" - - -#if FEATURE_beginend - - -static void -vbo_exec_debug_verts( struct vbo_exec_context *exec ) -{ - GLuint count = exec->vtx.vert_count; - GLuint i; - - printf("%s: %u vertices %d primitives, %d vertsize\n", - __FUNCTION__, - count, - exec->vtx.prim_count, - exec->vtx.vertex_size); - - for (i = 0 ; i < exec->vtx.prim_count ; i++) { - struct _mesa_prim *prim = &exec->vtx.prim[i]; - printf(" prim %d: %s%s %d..%d %s %s\n", - i, - _mesa_lookup_prim_by_nr(prim->mode), - prim->weak ? " (weak)" : "", - prim->start, - prim->start + prim->count, - prim->begin ? "BEGIN" : "(wrap)", - prim->end ? "END" : "(wrap)"); - } -} - - -/* - * NOTE: Need to have calculated primitives by this point -- do it on the fly. - * NOTE: Old 'parity' issue is gone. - */ -static GLuint -vbo_copy_vertices( struct vbo_exec_context *exec ) -{ - GLuint nr = exec->vtx.prim[exec->vtx.prim_count-1].count; - GLuint ovf, i; - GLuint sz = exec->vtx.vertex_size; - GLfloat *dst = exec->vtx.copied.buffer; - const GLfloat *src = (exec->vtx.buffer_map + - exec->vtx.prim[exec->vtx.prim_count-1].start * - exec->vtx.vertex_size); - - - switch (exec->ctx->Driver.CurrentExecPrimitive) { - case GL_POINTS: - return 0; - case GL_LINES: - ovf = nr&1; - for (i = 0 ; i < ovf ; i++) - memcpy( dst+i*sz, src+(nr-ovf+i)*sz, sz * sizeof(GLfloat) ); - return i; - case GL_TRIANGLES: - ovf = nr%3; - for (i = 0 ; i < ovf ; i++) - memcpy( dst+i*sz, src+(nr-ovf+i)*sz, sz * sizeof(GLfloat) ); - return i; - case GL_QUADS: - ovf = nr&3; - for (i = 0 ; i < ovf ; i++) - memcpy( dst+i*sz, src+(nr-ovf+i)*sz, sz * sizeof(GLfloat) ); - return i; - case GL_LINE_STRIP: - if (nr == 0) { - return 0; - } - else { - memcpy( dst, src+(nr-1)*sz, sz * sizeof(GLfloat) ); - return 1; - } - case GL_LINE_LOOP: - case GL_TRIANGLE_FAN: - case GL_POLYGON: - if (nr == 0) { - return 0; - } - else if (nr == 1) { - memcpy( dst, src+0, sz * sizeof(GLfloat) ); - return 1; - } - else { - memcpy( dst, src+0, sz * sizeof(GLfloat) ); - memcpy( dst+sz, src+(nr-1)*sz, sz * sizeof(GLfloat) ); - return 2; - } - case GL_TRIANGLE_STRIP: - /* no parity issue, but need to make sure the tri is not drawn twice */ - if (nr & 1) { - exec->vtx.prim[exec->vtx.prim_count-1].count--; - } - /* fallthrough */ - case GL_QUAD_STRIP: - switch (nr) { - case 0: - ovf = 0; - break; - case 1: - ovf = 1; - break; - default: - ovf = 2 + (nr & 1); - break; - } - for (i = 0 ; i < ovf ; i++) - memcpy( dst+i*sz, src+(nr-ovf+i)*sz, sz * sizeof(GLfloat) ); - return i; - case PRIM_OUTSIDE_BEGIN_END: - return 0; - default: - assert(0); - return 0; - } -} - - - -/* TODO: populate these as the vertex is defined: - */ -static void -vbo_exec_bind_arrays( struct gl_context *ctx ) -{ - struct vbo_context *vbo = vbo_context(ctx); - struct vbo_exec_context *exec = &vbo->exec; - struct gl_client_array *arrays = exec->vtx.arrays; - const GLuint count = exec->vtx.vert_count; - const GLuint *map; - GLuint attr; - GLbitfield varying_inputs = 0x0; - - /* Install the default (ie Current) attributes first, then overlay - * all active ones. 
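vbo_copy_vertices() in the removed file above computes how many trailing vertices must be carried into the next buffer when a primitive wraps across a flush. The per-mode arithmetic can be summarized in a small standalone helper (hypothetical enum; note the real code also shortens an odd triangle strip by one vertex so the last triangle is not drawn twice):

#include <stdio.h>

enum wrap_prim {
   WRAP_POINTS,
   WRAP_LINES,
   WRAP_TRIANGLES,
   WRAP_QUADS,
   WRAP_LINE_STRIP,
   WRAP_FAN_LIKE,    /* line loop, triangle fan, polygon */
   WRAP_STRIP        /* triangle strip, quad strip */
};

/* How many trailing vertices must be copied so the primitive can continue
 * in the next buffer: whole primitives are never split, only the remainder
 * (or the connecting vertices of strips, fans and loops) carries over. */
static unsigned
vertices_to_copy(enum wrap_prim mode, unsigned nr)
{
   switch (mode) {
   case WRAP_POINTS:     return 0;
   case WRAP_LINES:      return nr & 1;
   case WRAP_TRIANGLES:  return nr % 3;
   case WRAP_QUADS:      return nr & 3;
   case WRAP_LINE_STRIP: return nr ? 1 : 0;
   case WRAP_FAN_LIKE:   return nr >= 2 ? 2 : nr;
   case WRAP_STRIP:      return nr >= 2 ? 2 + (nr & 1) : nr;
   default:              return 0;
   }
}

int
main(void)
{
   printf("7 verts of triangles -> copy %u\n", vertices_to_copy(WRAP_TRIANGLES, 7));
   printf("7 verts of a strip   -> copy %u\n", vertices_to_copy(WRAP_STRIP, 7));
   return 0;
}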
- */ - switch (get_program_mode(exec->ctx)) { - case VP_NONE: - for (attr = 0; attr < 16; attr++) { - exec->vtx.inputs[attr] = &vbo->legacy_currval[attr]; - } - for (attr = 0; attr < MAT_ATTRIB_MAX; attr++) { - ASSERT(attr + 16 < Elements(exec->vtx.inputs)); - exec->vtx.inputs[attr + 16] = &vbo->mat_currval[attr]; - } - map = vbo->map_vp_none; - break; - case VP_NV: - case VP_ARB: - /* The aliasing of attributes for NV vertex programs has already - * occurred. NV vertex programs cannot access material values, - * nor attributes greater than VERT_ATTRIB_TEX7. - */ - for (attr = 0; attr < 16; attr++) { - exec->vtx.inputs[attr] = &vbo->legacy_currval[attr]; - ASSERT(attr + 16 < Elements(exec->vtx.inputs)); - exec->vtx.inputs[attr + 16] = &vbo->generic_currval[attr]; - } - map = vbo->map_vp_arb; - - /* check if VERT_ATTRIB_POS is not read but VERT_BIT_GENERIC0 is read. - * In that case we effectively need to route the data from - * glVertexAttrib(0, val) calls to feed into the GENERIC0 input. - */ - if ((ctx->VertexProgram._Current->Base.InputsRead & VERT_BIT_POS) == 0 && - (ctx->VertexProgram._Current->Base.InputsRead & VERT_BIT_GENERIC0)) { - exec->vtx.inputs[16] = exec->vtx.inputs[0]; - exec->vtx.attrsz[16] = exec->vtx.attrsz[0]; - exec->vtx.attrptr[16] = exec->vtx.attrptr[0]; - exec->vtx.attrsz[0] = 0; - } - break; - default: - assert(0); - } - - /* Make all active attributes (including edgeflag) available as - * arrays of floats. - */ - for (attr = 0; attr < VERT_ATTRIB_MAX ; attr++) { - const GLuint src = map[attr]; - - if (exec->vtx.attrsz[src]) { - GLsizeiptr offset = (GLbyte *)exec->vtx.attrptr[src] - - (GLbyte *)exec->vtx.vertex; - - /* override the default array set above */ - ASSERT(attr < Elements(exec->vtx.inputs)); - ASSERT(attr < Elements(exec->vtx.arrays)); /* arrays[] */ - exec->vtx.inputs[attr] = &arrays[attr]; - - if (_mesa_is_bufferobj(exec->vtx.bufferobj)) { - /* a real buffer obj: Ptr is an offset, not a pointer*/ - assert(exec->vtx.bufferobj->Pointer); /* buf should be mapped */ - assert(offset >= 0); - arrays[attr].Ptr = (GLubyte *)exec->vtx.bufferobj->Offset + offset; - } - else { - /* Ptr into ordinary app memory */ - arrays[attr].Ptr = (GLubyte *)exec->vtx.buffer_map + offset; - } - arrays[attr].Size = exec->vtx.attrsz[src]; - arrays[attr].StrideB = exec->vtx.vertex_size * sizeof(GLfloat); - arrays[attr].Stride = exec->vtx.vertex_size * sizeof(GLfloat); - arrays[attr].Type = GL_FLOAT; - arrays[attr].Format = GL_RGBA; - arrays[attr].Enabled = 1; - _mesa_reference_buffer_object(ctx, - &arrays[attr].BufferObj, - exec->vtx.bufferobj); - arrays[attr]._MaxElement = count; /* ??? 
*/ - - varying_inputs |= 1 << attr; - } - } - - _mesa_set_varying_vp_inputs( ctx, varying_inputs ); -} - - -static void -vbo_exec_vtx_unmap( struct vbo_exec_context *exec ) -{ - GLenum target = GL_ARRAY_BUFFER_ARB; - - if (_mesa_is_bufferobj(exec->vtx.bufferobj)) { - struct gl_context *ctx = exec->ctx; - - if (ctx->Driver.FlushMappedBufferRange) { - GLintptr offset = exec->vtx.buffer_used - exec->vtx.bufferobj->Offset; - GLsizeiptr length = (exec->vtx.buffer_ptr - exec->vtx.buffer_map) * sizeof(float); - - if (length) - ctx->Driver.FlushMappedBufferRange(ctx, target, - offset, length, - exec->vtx.bufferobj); - } - - exec->vtx.buffer_used += (exec->vtx.buffer_ptr - - exec->vtx.buffer_map) * sizeof(float); - - assert(exec->vtx.buffer_used <= VBO_VERT_BUFFER_SIZE); - assert(exec->vtx.buffer_ptr != NULL); - - ctx->Driver.UnmapBuffer(ctx, target, exec->vtx.bufferobj); - exec->vtx.buffer_map = NULL; - exec->vtx.buffer_ptr = NULL; - exec->vtx.max_vert = 0; - } -} - - -void -vbo_exec_vtx_map( struct vbo_exec_context *exec ) -{ - struct gl_context *ctx = exec->ctx; - const GLenum target = GL_ARRAY_BUFFER_ARB; - const GLenum access = GL_READ_WRITE_ARB; /* for MapBuffer */ - const GLenum accessRange = GL_MAP_WRITE_BIT | /* for MapBufferRange */ - GL_MAP_INVALIDATE_RANGE_BIT | - GL_MAP_UNSYNCHRONIZED_BIT | - GL_MAP_FLUSH_EXPLICIT_BIT | - MESA_MAP_NOWAIT_BIT; - const GLenum usage = GL_STREAM_DRAW_ARB; - - if (!_mesa_is_bufferobj(exec->vtx.bufferobj)) - return; - - if (exec->vtx.buffer_map != NULL) { - assert(0); - exec->vtx.buffer_map = NULL; - exec->vtx.buffer_ptr = NULL; - } - - if (VBO_VERT_BUFFER_SIZE > exec->vtx.buffer_used + 1024 && - ctx->Driver.MapBufferRange) { - exec->vtx.buffer_map = - (GLfloat *)ctx->Driver.MapBufferRange(ctx, - target, - exec->vtx.buffer_used, - (VBO_VERT_BUFFER_SIZE - - exec->vtx.buffer_used), - accessRange, - exec->vtx.bufferobj); - exec->vtx.buffer_ptr = exec->vtx.buffer_map; - } - - if (!exec->vtx.buffer_map) { - exec->vtx.buffer_used = 0; - - ctx->Driver.BufferData(ctx, target, - VBO_VERT_BUFFER_SIZE, - NULL, usage, exec->vtx.bufferobj); - - - if (ctx->Driver.MapBufferRange) - exec->vtx.buffer_map = - (GLfloat *)ctx->Driver.MapBufferRange(ctx, target, - 0, VBO_VERT_BUFFER_SIZE, - accessRange, - exec->vtx.bufferobj); - if (!exec->vtx.buffer_map) - exec->vtx.buffer_map = - (GLfloat *)ctx->Driver.MapBuffer(ctx, target, access, exec->vtx.bufferobj); - assert(exec->vtx.buffer_map); - exec->vtx.buffer_ptr = exec->vtx.buffer_map; - } - - if (0) - printf("map %d..\n", exec->vtx.buffer_used); -} - - - -/** - * Execute the buffer and save copied verts. - */ -void -vbo_exec_vtx_flush( struct vbo_exec_context *exec, GLboolean unmap ) -{ - if (0) - vbo_exec_debug_verts( exec ); - - if (exec->vtx.prim_count && - exec->vtx.vert_count) { - - exec->vtx.copied.nr = vbo_copy_vertices( exec ); - - if (exec->vtx.copied.nr != exec->vtx.vert_count) { - struct gl_context *ctx = exec->ctx; - - /* Before the update_state() as this may raise _NEW_ARRAY - * from _mesa_set_varying_vp_inputs(). - */ - vbo_exec_bind_arrays( ctx ); - - if (ctx->NewState) - _mesa_update_state( ctx ); - - if (_mesa_is_bufferobj(exec->vtx.bufferobj)) { - vbo_exec_vtx_unmap( exec ); - } - - if (0) - printf("%s %d %d\n", __FUNCTION__, exec->vtx.prim_count, - exec->vtx.vert_count); - - vbo_context(ctx)->draw_prims( ctx, - exec->vtx.inputs, - exec->vtx.prim, - exec->vtx.prim_count, - NULL, - GL_TRUE, - 0, - exec->vtx.vert_count - 1); - - /* If using a real VBO, get new storage -- unless asked not to. 
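vbo_exec_vtx_map() above is the classic streaming-VBO pattern: append into the unused tail of the buffer with an unsynchronized ranged map, and orphan the storage with BufferData when the tail runs out. Expressed against the public GL API instead of the driver hooks, a sketch of the same strategy (assumes a current GL 3.x context and a loader that exposes these entry points; names and the buffer size are hypothetical):

#include <GL/glew.h>   /* any loader exposing the GL 3.x entry points works */

#define STREAM_VBO_SIZE (4 * 1024 * 1024)

/* Map 'size' bytes for appending at byte offset '*used' of a streaming VBO,
 * orphaning the storage when the tail is full.  This mirrors the
 * MapBufferRange/BufferData strategy in vbo_exec_vtx_map(). */
static void *
map_for_append(GLuint vbo, GLsizeiptr size, GLintptr *used)
{
   glBindBuffer(GL_ARRAY_BUFFER, vbo);

   if (*used + size > STREAM_VBO_SIZE) {
      /* Out of room: orphan the old storage so the GPU can keep reading it
       * while we get a fresh buffer to write into. */
      glBufferData(GL_ARRAY_BUFFER, STREAM_VBO_SIZE, NULL, GL_STREAM_DRAW);
      *used = 0;
   }

   return glMapBufferRange(GL_ARRAY_BUFFER, *used, size,
                           GL_MAP_WRITE_BIT |
                           GL_MAP_INVALIDATE_RANGE_BIT |
                           GL_MAP_UNSYNCHRONIZED_BIT |
                           GL_MAP_FLUSH_EXPLICIT_BIT);
}

/* After writing 'size' bytes into the mapping: flush just that range,
 * unmap, and advance the append offset. */
static void
finish_append(GLsizeiptr size, GLintptr *used)
{
   glFlushMappedBufferRange(GL_ARRAY_BUFFER, 0, size);
   glUnmapBuffer(GL_ARRAY_BUFFER);
   *used += size;
}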
- */ - if (_mesa_is_bufferobj(exec->vtx.bufferobj) && !unmap) { - vbo_exec_vtx_map( exec ); - } - } - } - - /* May have to unmap explicitly if we didn't draw: - */ - if (unmap && - _mesa_is_bufferobj(exec->vtx.bufferobj) && - exec->vtx.buffer_map) { - vbo_exec_vtx_unmap( exec ); - } - - - if (unmap || exec->vtx.vertex_size == 0) - exec->vtx.max_vert = 0; - else - exec->vtx.max_vert = ((VBO_VERT_BUFFER_SIZE - exec->vtx.buffer_used) / - (exec->vtx.vertex_size * sizeof(GLfloat))); - - exec->vtx.buffer_ptr = exec->vtx.buffer_map; - exec->vtx.prim_count = 0; - exec->vtx.vert_count = 0; -} - - -#endif /* FEATURE_beginend */ +/* + * Mesa 3-D graphics library + * Version: 7.2 + * + * Copyright (C) 1999-2008 Brian Paul All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN + * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * Authors: + * Keith Whitwell + */ + +#include "main/glheader.h" +#include "main/bufferobj.h" +#include "main/compiler.h" +#include "main/enums.h" +#include "main/mfeatures.h" +#include "main/state.h" + +#include "vbo_context.h" + + +#if FEATURE_beginend + + +static void +vbo_exec_debug_verts( struct vbo_exec_context *exec ) +{ + GLuint count = exec->vtx.vert_count; + GLuint i; + + printf("%s: %u vertices %d primitives, %d vertsize\n", + __FUNCTION__, + count, + exec->vtx.prim_count, + exec->vtx.vertex_size); + + for (i = 0 ; i < exec->vtx.prim_count ; i++) { + struct _mesa_prim *prim = &exec->vtx.prim[i]; + printf(" prim %d: %s%s %d..%d %s %s\n", + i, + _mesa_lookup_prim_by_nr(prim->mode), + prim->weak ? " (weak)" : "", + prim->start, + prim->start + prim->count, + prim->begin ? "BEGIN" : "(wrap)", + prim->end ? "END" : "(wrap)"); + } +} + + +/* + * NOTE: Need to have calculated primitives by this point -- do it on the fly. + * NOTE: Old 'parity' issue is gone. 
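The max_vert computation at the end of the flush above only asks how many whole vertices still fit in the unused part of the stream buffer. As a tiny standalone helper (hypothetical names):

#include <stdio.h>
#include <stddef.h>

/* How many whole vertices of 'vertex_size' floats still fit after 'used'
 * bytes of a 'total'-byte vertex buffer (the max_vert computation above). */
static unsigned
max_vertices_left(size_t total, size_t used, unsigned vertex_size)
{
   if (vertex_size == 0 || used >= total)
      return 0;
   return (unsigned) ((total - used) / (vertex_size * sizeof(float)));
}

int
main(void)
{
   printf("%u vertices left\n", max_vertices_left(4096, 4000, 8));
   return 0;
}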
+ */ +static GLuint +vbo_copy_vertices( struct vbo_exec_context *exec ) +{ + GLuint nr = exec->vtx.prim[exec->vtx.prim_count-1].count; + GLuint ovf, i; + GLuint sz = exec->vtx.vertex_size; + GLfloat *dst = exec->vtx.copied.buffer; + const GLfloat *src = (exec->vtx.buffer_map + + exec->vtx.prim[exec->vtx.prim_count-1].start * + exec->vtx.vertex_size); + + + switch (exec->ctx->Driver.CurrentExecPrimitive) { + case GL_POINTS: + return 0; + case GL_LINES: + ovf = nr&1; + for (i = 0 ; i < ovf ; i++) + memcpy( dst+i*sz, src+(nr-ovf+i)*sz, sz * sizeof(GLfloat) ); + return i; + case GL_TRIANGLES: + ovf = nr%3; + for (i = 0 ; i < ovf ; i++) + memcpy( dst+i*sz, src+(nr-ovf+i)*sz, sz * sizeof(GLfloat) ); + return i; + case GL_QUADS: + ovf = nr&3; + for (i = 0 ; i < ovf ; i++) + memcpy( dst+i*sz, src+(nr-ovf+i)*sz, sz * sizeof(GLfloat) ); + return i; + case GL_LINE_STRIP: + if (nr == 0) { + return 0; + } + else { + memcpy( dst, src+(nr-1)*sz, sz * sizeof(GLfloat) ); + return 1; + } + case GL_LINE_LOOP: + case GL_TRIANGLE_FAN: + case GL_POLYGON: + if (nr == 0) { + return 0; + } + else if (nr == 1) { + memcpy( dst, src+0, sz * sizeof(GLfloat) ); + return 1; + } + else { + memcpy( dst, src+0, sz * sizeof(GLfloat) ); + memcpy( dst+sz, src+(nr-1)*sz, sz * sizeof(GLfloat) ); + return 2; + } + case GL_TRIANGLE_STRIP: + /* no parity issue, but need to make sure the tri is not drawn twice */ + if (nr & 1) { + exec->vtx.prim[exec->vtx.prim_count-1].count--; + } + /* fallthrough */ + case GL_QUAD_STRIP: + switch (nr) { + case 0: + ovf = 0; + break; + case 1: + ovf = 1; + break; + default: + ovf = 2 + (nr & 1); + break; + } + for (i = 0 ; i < ovf ; i++) + memcpy( dst+i*sz, src+(nr-ovf+i)*sz, sz * sizeof(GLfloat) ); + return i; + case PRIM_OUTSIDE_BEGIN_END: + return 0; + default: + assert(0); + return 0; + } +} + + + +/* TODO: populate these as the vertex is defined: + */ +static void +vbo_exec_bind_arrays( struct gl_context *ctx ) +{ + struct vbo_context *vbo = vbo_context(ctx); + struct vbo_exec_context *exec = &vbo->exec; + struct gl_client_array *arrays = exec->vtx.arrays; + const GLuint count = exec->vtx.vert_count; + const GLuint *map; + GLuint attr; + GLbitfield varying_inputs = 0x0; + + /* Install the default (ie Current) attributes first, then overlay + * all active ones. + */ + switch (get_program_mode(exec->ctx)) { + case VP_NONE: + for (attr = 0; attr < 16; attr++) { + exec->vtx.inputs[attr] = &vbo->legacy_currval[attr]; + } + for (attr = 0; attr < MAT_ATTRIB_MAX; attr++) { + ASSERT(attr + 16 < Elements(exec->vtx.inputs)); + exec->vtx.inputs[attr + 16] = &vbo->mat_currval[attr]; + } + map = vbo->map_vp_none; + break; + case VP_NV: + case VP_ARB: + /* The aliasing of attributes for NV vertex programs has already + * occurred. NV vertex programs cannot access material values, + * nor attributes greater than VERT_ATTRIB_TEX7. + */ + for (attr = 0; attr < 16; attr++) { + exec->vtx.inputs[attr] = &vbo->legacy_currval[attr]; + ASSERT(attr + 16 < Elements(exec->vtx.inputs)); + exec->vtx.inputs[attr + 16] = &vbo->generic_currval[attr]; + } + map = vbo->map_vp_arb; + + /* check if VERT_ATTRIB_POS is not read but VERT_BIT_GENERIC0 is read. + * In that case we effectively need to route the data from + * glVertexAttrib(0, val) calls to feed into the GENERIC0 input. 
+ */ + if ((ctx->VertexProgram._Current->Base.InputsRead & VERT_BIT_POS) == 0 && + (ctx->VertexProgram._Current->Base.InputsRead & VERT_BIT_GENERIC0)) { + exec->vtx.inputs[16] = exec->vtx.inputs[0]; + exec->vtx.attrsz[16] = exec->vtx.attrsz[0]; + exec->vtx.attrptr[16] = exec->vtx.attrptr[0]; + exec->vtx.attrsz[0] = 0; + } + break; + default: + assert(0); + } + + /* Make all active attributes (including edgeflag) available as + * arrays of floats. + */ + for (attr = 0; attr < VERT_ATTRIB_MAX ; attr++) { + const GLuint src = map[attr]; + + if (exec->vtx.attrsz[src]) { + GLsizeiptr offset = (GLbyte *)exec->vtx.attrptr[src] - + (GLbyte *)exec->vtx.vertex; + + /* override the default array set above */ + ASSERT(attr < Elements(exec->vtx.inputs)); + ASSERT(attr < Elements(exec->vtx.arrays)); /* arrays[] */ + exec->vtx.inputs[attr] = &arrays[attr]; + + if (_mesa_is_bufferobj(exec->vtx.bufferobj)) { + /* a real buffer obj: Ptr is an offset, not a pointer*/ + assert(exec->vtx.bufferobj->Pointer); /* buf should be mapped */ + assert(offset >= 0); + arrays[attr].Ptr = (GLubyte *)exec->vtx.bufferobj->Offset + offset; + } + else { + /* Ptr into ordinary app memory */ + arrays[attr].Ptr = (GLubyte *)exec->vtx.buffer_map + offset; + } + arrays[attr].Size = exec->vtx.attrsz[src]; + arrays[attr].StrideB = exec->vtx.vertex_size * sizeof(GLfloat); + arrays[attr].Stride = exec->vtx.vertex_size * sizeof(GLfloat); + arrays[attr].Type = GL_FLOAT; + arrays[attr].Format = GL_RGBA; + arrays[attr].Enabled = 1; + _mesa_reference_buffer_object(ctx, + &arrays[attr].BufferObj, + exec->vtx.bufferobj); + arrays[attr]._MaxElement = count; /* ??? */ + + varying_inputs |= 1 << attr; + ctx->NewState |= _NEW_ARRAY; + } + } + + _mesa_set_varying_vp_inputs( ctx, varying_inputs ); +} + + +static void +vbo_exec_vtx_unmap( struct vbo_exec_context *exec ) +{ + GLenum target = GL_ARRAY_BUFFER_ARB; + + if (_mesa_is_bufferobj(exec->vtx.bufferobj)) { + struct gl_context *ctx = exec->ctx; + + if (ctx->Driver.FlushMappedBufferRange) { + GLintptr offset = exec->vtx.buffer_used - exec->vtx.bufferobj->Offset; + GLsizeiptr length = (exec->vtx.buffer_ptr - exec->vtx.buffer_map) * sizeof(float); + + if (length) + ctx->Driver.FlushMappedBufferRange(ctx, target, + offset, length, + exec->vtx.bufferobj); + } + + exec->vtx.buffer_used += (exec->vtx.buffer_ptr - + exec->vtx.buffer_map) * sizeof(float); + + assert(exec->vtx.buffer_used <= VBO_VERT_BUFFER_SIZE); + assert(exec->vtx.buffer_ptr != NULL); + + ctx->Driver.UnmapBuffer(ctx, target, exec->vtx.bufferobj); + exec->vtx.buffer_map = NULL; + exec->vtx.buffer_ptr = NULL; + exec->vtx.max_vert = 0; + } +} + + +void +vbo_exec_vtx_map( struct vbo_exec_context *exec ) +{ + struct gl_context *ctx = exec->ctx; + const GLenum target = GL_ARRAY_BUFFER_ARB; + const GLenum access = GL_READ_WRITE_ARB; /* for MapBuffer */ + const GLenum accessRange = GL_MAP_WRITE_BIT | /* for MapBufferRange */ + GL_MAP_INVALIDATE_RANGE_BIT | + GL_MAP_UNSYNCHRONIZED_BIT | + GL_MAP_FLUSH_EXPLICIT_BIT | + MESA_MAP_NOWAIT_BIT; + const GLenum usage = GL_STREAM_DRAW_ARB; + + if (!_mesa_is_bufferobj(exec->vtx.bufferobj)) + return; + + if (exec->vtx.buffer_map != NULL) { + assert(0); + exec->vtx.buffer_map = NULL; + exec->vtx.buffer_ptr = NULL; + } + + if (VBO_VERT_BUFFER_SIZE > exec->vtx.buffer_used + 1024 && + ctx->Driver.MapBufferRange) { + exec->vtx.buffer_map = + (GLfloat *)ctx->Driver.MapBufferRange(ctx, + target, + exec->vtx.buffer_used, + (VBO_VERT_BUFFER_SIZE - + exec->vtx.buffer_used), + accessRange, + exec->vtx.bufferobj); + 
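The ctx->NewState |= _NEW_ARRAY line added in the hunk above appears to be the functional change here: rebinding the exec arrays now flags array state dirty so the next _mesa_update_state() revalidates it. The underlying idiom is plain dirty-bit bookkeeping; a toy version (hypothetical flags and names) looks like:

#include <stdint.h>
#include <stdio.h>

/* Toy dirty-flag bookkeeping in the style of ctx->NewState / _NEW_ARRAY. */
#define STATE_NEW_ARRAY   (1u << 0)
#define STATE_NEW_PROGRAM (1u << 1)

struct toy_context {
   uint32_t new_state;   /* accumulated dirty bits */
};

static void
bind_arrays(struct toy_context *ctx)
{
   /* ... point the arrays somewhere new ... */
   ctx->new_state |= STATE_NEW_ARRAY;    /* mark derived array state stale */
}

static void
update_state(struct toy_context *ctx)
{
   if (ctx->new_state & STATE_NEW_ARRAY)
      printf("revalidating array-derived state\n");
   ctx->new_state = 0;                   /* everything clean again */
}

int
main(void)
{
   struct toy_context ctx = { 0 };
   bind_arrays(&ctx);
   if (ctx.new_state)
      update_state(&ctx);
   return 0;
}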
exec->vtx.buffer_ptr = exec->vtx.buffer_map; + } + + if (!exec->vtx.buffer_map) { + exec->vtx.buffer_used = 0; + + ctx->Driver.BufferData(ctx, target, + VBO_VERT_BUFFER_SIZE, + NULL, usage, exec->vtx.bufferobj); + + + if (ctx->Driver.MapBufferRange) + exec->vtx.buffer_map = + (GLfloat *)ctx->Driver.MapBufferRange(ctx, target, + 0, VBO_VERT_BUFFER_SIZE, + accessRange, + exec->vtx.bufferobj); + if (!exec->vtx.buffer_map) + exec->vtx.buffer_map = + (GLfloat *)ctx->Driver.MapBuffer(ctx, target, access, exec->vtx.bufferobj); + assert(exec->vtx.buffer_map); + exec->vtx.buffer_ptr = exec->vtx.buffer_map; + } + + if (0) + printf("map %d..\n", exec->vtx.buffer_used); +} + + + +/** + * Execute the buffer and save copied verts. + */ +void +vbo_exec_vtx_flush( struct vbo_exec_context *exec, GLboolean unmap ) +{ + if (0) + vbo_exec_debug_verts( exec ); + + if (exec->vtx.prim_count && + exec->vtx.vert_count) { + + exec->vtx.copied.nr = vbo_copy_vertices( exec ); + + if (exec->vtx.copied.nr != exec->vtx.vert_count) { + struct gl_context *ctx = exec->ctx; + + /* Before the update_state() as this may raise _NEW_ARRAY + * from _mesa_set_varying_vp_inputs(). + */ + vbo_exec_bind_arrays( ctx ); + + if (ctx->NewState) + _mesa_update_state( ctx ); + + if (_mesa_is_bufferobj(exec->vtx.bufferobj)) { + vbo_exec_vtx_unmap( exec ); + } + + if (0) + printf("%s %d %d\n", __FUNCTION__, exec->vtx.prim_count, + exec->vtx.vert_count); + + vbo_context(ctx)->draw_prims( ctx, + exec->vtx.inputs, + exec->vtx.prim, + exec->vtx.prim_count, + NULL, + GL_TRUE, + 0, + exec->vtx.vert_count - 1); + + /* If using a real VBO, get new storage -- unless asked not to. + */ + if (_mesa_is_bufferobj(exec->vtx.bufferobj) && !unmap) { + vbo_exec_vtx_map( exec ); + } + } + } + + /* May have to unmap explicitly if we didn't draw: + */ + if (unmap && + _mesa_is_bufferobj(exec->vtx.bufferobj) && + exec->vtx.buffer_map) { + vbo_exec_vtx_unmap( exec ); + } + + + if (unmap || exec->vtx.vertex_size == 0) + exec->vtx.max_vert = 0; + else + exec->vtx.max_vert = ((VBO_VERT_BUFFER_SIZE - exec->vtx.buffer_used) / + (exec->vtx.vertex_size * sizeof(GLfloat))); + + exec->vtx.buffer_ptr = exec->vtx.buffer_map; + exec->vtx.prim_count = 0; + exec->vtx.vert_count = 0; +} + + +#endif /* FEATURE_beginend */ diff --git a/mesalib/src/mesa/vbo/vbo_save_draw.c b/mesalib/src/mesa/vbo/vbo_save_draw.c index 6d8dbdb86..634a6d3f8 100644 --- a/mesalib/src/mesa/vbo/vbo_save_draw.c +++ b/mesalib/src/mesa/vbo/vbo_save_draw.c @@ -1,304 +1,305 @@ -/* - * Mesa 3-D graphics library - * Version: 7.2 - * - * Copyright (C) 1999-2008 Brian Paul All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL - * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN - * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -/* Author: - * Keith Whitwell - */ - -#include "main/glheader.h" -#include "main/bufferobj.h" -#include "main/context.h" -#include "main/imports.h" -#include "main/mfeatures.h" -#include "main/mtypes.h" -#include "main/macros.h" -#include "main/light.h" -#include "main/state.h" - -#include "vbo_context.h" - - -#if FEATURE_dlist - - -/** - * After playback, copy everything but the position from the - * last vertex to the saved state - */ -static void -_playback_copy_to_current(struct gl_context *ctx, - const struct vbo_save_vertex_list *node) -{ - struct vbo_context *vbo = vbo_context(ctx); - GLfloat vertex[VBO_ATTRIB_MAX * 4]; - GLfloat *data; - GLuint i, offset; - - if (node->current_size == 0) - return; - - if (node->current_data) { - data = node->current_data; - } - else { - data = vertex; - - if (node->count) - offset = (node->buffer_offset + - (node->count-1) * node->vertex_size * sizeof(GLfloat)); - else - offset = node->buffer_offset; - - ctx->Driver.GetBufferSubData( ctx, 0, offset, - node->vertex_size * sizeof(GLfloat), - data, node->vertex_store->bufferobj ); - - data += node->attrsz[0]; /* skip vertex position */ - } - - for (i = VBO_ATTRIB_POS+1 ; i < VBO_ATTRIB_MAX ; i++) { - if (node->attrsz[i]) { - GLfloat *current = (GLfloat *)vbo->currval[i].Ptr; - GLfloat tmp[4]; - - COPY_CLEAN_4V(tmp, - node->attrsz[i], - data); - - if (memcmp(current, tmp, 4 * sizeof(GLfloat)) != 0) { - memcpy(current, tmp, 4 * sizeof(GLfloat)); - - vbo->currval[i].Size = node->attrsz[i]; - - if (i >= VBO_ATTRIB_FIRST_MATERIAL && - i <= VBO_ATTRIB_LAST_MATERIAL) - ctx->NewState |= _NEW_LIGHT; - - ctx->NewState |= _NEW_CURRENT_ATTRIB; - } - - data += node->attrsz[i]; - } - } - - /* Colormaterial -- this kindof sucks. - */ - if (ctx->Light.ColorMaterialEnabled) { - _mesa_update_color_material(ctx, ctx->Current.Attrib[VBO_ATTRIB_COLOR0]); - } - - /* CurrentExecPrimitive - */ - if (node->prim_count) { - const struct _mesa_prim *prim = &node->prim[node->prim_count - 1]; - if (prim->end) - ctx->Driver.CurrentExecPrimitive = PRIM_OUTSIDE_BEGIN_END; - else - ctx->Driver.CurrentExecPrimitive = prim->mode; - } -} - - - -/** - * Treat the vertex storage as a VBO, define vertex arrays pointing - * into it: - */ -static void vbo_bind_vertex_list(struct gl_context *ctx, - const struct vbo_save_vertex_list *node) -{ - struct vbo_context *vbo = vbo_context(ctx); - struct vbo_save_context *save = &vbo->save; - struct gl_client_array *arrays = save->arrays; - GLuint buffer_offset = node->buffer_offset; - const GLuint *map; - GLuint attr; - GLubyte node_attrsz[VBO_ATTRIB_MAX]; /* copy of node->attrsz[] */ - GLbitfield varying_inputs = 0x0; - - memcpy(node_attrsz, node->attrsz, sizeof(node->attrsz)); - - /* Install the default (ie Current) attributes first, then overlay - * all active ones. - */ - switch (get_program_mode(ctx)) { - case VP_NONE: - for (attr = 0; attr < 16; attr++) { - save->inputs[attr] = &vbo->legacy_currval[attr]; - } - for (attr = 0; attr < MAT_ATTRIB_MAX; attr++) { - save->inputs[attr + 16] = &vbo->mat_currval[attr]; - } - map = vbo->map_vp_none; - break; - case VP_NV: - case VP_ARB: - /* The aliasing of attributes for NV vertex programs has already - * occurred. 
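_playback_copy_to_current() above relies on COPY_CLEAN_4V() to widen an attribute of 1-4 stored components into a canonical (x, y, z, w) vector, defaulting the missing components to (0, 0, 0, 1) before comparing it with the current value. A standalone equivalent of that expansion (hypothetical name):

#include <stdio.h>
#include <string.h>

/* Widen an attribute of 'size' stored floats into a 4-component vector,
 * defaulting the missing components to (0, 0, 0, 1) as GL does. */
static void
clean_4v(float dst[4], unsigned size, const float *src)
{
   dst[0] = 0.0f;
   dst[1] = 0.0f;
   dst[2] = 0.0f;
   dst[3] = 1.0f;
   memcpy(dst, src, size * sizeof(float));
}

int
main(void)
{
   const float stored[2] = { 0.5f, 0.25f };
   float full[4];
   clean_4v(full, 2, stored);
   printf("%g %g %g %g\n", full[0], full[1], full[2], full[3]);
   return 0;
}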
NV vertex programs cannot access material values, - * nor attributes greater than VERT_ATTRIB_TEX7. - */ - for (attr = 0; attr < 16; attr++) { - save->inputs[attr] = &vbo->legacy_currval[attr]; - save->inputs[attr + 16] = &vbo->generic_currval[attr]; - } - map = vbo->map_vp_arb; - - /* check if VERT_ATTRIB_POS is not read but VERT_BIT_GENERIC0 is read. - * In that case we effectively need to route the data from - * glVertexAttrib(0, val) calls to feed into the GENERIC0 input. - */ - if ((ctx->VertexProgram._Current->Base.InputsRead & VERT_BIT_POS) == 0 && - (ctx->VertexProgram._Current->Base.InputsRead & VERT_BIT_GENERIC0)) { - save->inputs[16] = save->inputs[0]; - node_attrsz[16] = node_attrsz[0]; - node_attrsz[0] = 0; - } - break; - default: - assert(0); - } - - for (attr = 0; attr < VERT_ATTRIB_MAX; attr++) { - const GLuint src = map[attr]; - - if (node_attrsz[src]) { - /* override the default array set above */ - save->inputs[attr] = &arrays[attr]; - - arrays[attr].Ptr = (const GLubyte *) NULL + buffer_offset; - arrays[attr].Size = node->attrsz[src]; - arrays[attr].StrideB = node->vertex_size * sizeof(GLfloat); - arrays[attr].Stride = node->vertex_size * sizeof(GLfloat); - arrays[attr].Type = GL_FLOAT; - arrays[attr].Format = GL_RGBA; - arrays[attr].Enabled = 1; - _mesa_reference_buffer_object(ctx, - &arrays[attr].BufferObj, - node->vertex_store->bufferobj); - arrays[attr]._MaxElement = node->count; /* ??? */ - - assert(arrays[attr].BufferObj->Name); - - buffer_offset += node->attrsz[src] * sizeof(GLfloat); - varying_inputs |= 1<Driver.MapBuffer(ctx, - GL_ARRAY_BUFFER_ARB, - GL_READ_ONLY, /* ? */ - list->vertex_store->bufferobj); - - vbo_loopback_vertex_list(ctx, - (const GLfloat *)(buffer + list->buffer_offset), - list->attrsz, - list->prim, - list->prim_count, - list->wrap_count, - list->vertex_size); - - ctx->Driver.UnmapBuffer(ctx, GL_ARRAY_BUFFER_ARB, - list->vertex_store->bufferobj); -} - - -/** - * Execute the buffer and save copied verts. - * This is called from the display list code when executing - * a drawing command. - */ -void -vbo_save_playback_vertex_list(struct gl_context *ctx, void *data) -{ - const struct vbo_save_vertex_list *node = - (const struct vbo_save_vertex_list *) data; - struct vbo_save_context *save = &vbo_context(ctx)->save; - - FLUSH_CURRENT(ctx, 0); - - if (node->prim_count > 0 && node->count > 0) { - - if (ctx->Driver.CurrentExecPrimitive != PRIM_OUTSIDE_BEGIN_END && - node->prim[0].begin) { - - /* Degenerate case: list is called inside begin/end pair and - * includes operations such as glBegin or glDrawArrays. - */ - if (0) - printf("displaylist recursive begin"); - - vbo_save_loopback_vertex_list( ctx, node ); - return; - } - else if (save->replay_flags) { - /* Various degnerate cases: translate into immediate mode - * calls rather than trying to execute in place. - */ - vbo_save_loopback_vertex_list( ctx, node ); - return; - } - - if (ctx->NewState) - _mesa_update_state( ctx ); - - /* XXX also need to check if shader enabled, but invalid */ - if ((ctx->VertexProgram.Enabled && !ctx->VertexProgram._Enabled) || - (ctx->FragmentProgram.Enabled && !ctx->FragmentProgram._Enabled)) { - _mesa_error(ctx, GL_INVALID_OPERATION, - "glBegin (invalid vertex/fragment program)"); - return; - } - - vbo_bind_vertex_list( ctx, node ); - - /* Again... 
- */ - if (ctx->NewState) - _mesa_update_state( ctx ); - - vbo_context(ctx)->draw_prims(ctx, - save->inputs, - node->prim, - node->prim_count, - NULL, - GL_TRUE, - 0, /* Node is a VBO, so this is ok */ - node->count - 1); - } - - /* Copy to current? - */ - _playback_copy_to_current( ctx, node ); -} - - -#endif /* FEATURE_dlist */ +/* + * Mesa 3-D graphics library + * Version: 7.2 + * + * Copyright (C) 1999-2008 Brian Paul All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN + * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +/* Author: + * Keith Whitwell + */ + +#include "main/glheader.h" +#include "main/bufferobj.h" +#include "main/context.h" +#include "main/imports.h" +#include "main/mfeatures.h" +#include "main/mtypes.h" +#include "main/macros.h" +#include "main/light.h" +#include "main/state.h" + +#include "vbo_context.h" + + +#if FEATURE_dlist + + +/** + * After playback, copy everything but the position from the + * last vertex to the saved state + */ +static void +_playback_copy_to_current(struct gl_context *ctx, + const struct vbo_save_vertex_list *node) +{ + struct vbo_context *vbo = vbo_context(ctx); + GLfloat vertex[VBO_ATTRIB_MAX * 4]; + GLfloat *data; + GLuint i, offset; + + if (node->current_size == 0) + return; + + if (node->current_data) { + data = node->current_data; + } + else { + data = vertex; + + if (node->count) + offset = (node->buffer_offset + + (node->count-1) * node->vertex_size * sizeof(GLfloat)); + else + offset = node->buffer_offset; + + ctx->Driver.GetBufferSubData( ctx, 0, offset, + node->vertex_size * sizeof(GLfloat), + data, node->vertex_store->bufferobj ); + + data += node->attrsz[0]; /* skip vertex position */ + } + + for (i = VBO_ATTRIB_POS+1 ; i < VBO_ATTRIB_MAX ; i++) { + if (node->attrsz[i]) { + GLfloat *current = (GLfloat *)vbo->currval[i].Ptr; + GLfloat tmp[4]; + + COPY_CLEAN_4V(tmp, + node->attrsz[i], + data); + + if (memcmp(current, tmp, 4 * sizeof(GLfloat)) != 0) { + memcpy(current, tmp, 4 * sizeof(GLfloat)); + + vbo->currval[i].Size = node->attrsz[i]; + + if (i >= VBO_ATTRIB_FIRST_MATERIAL && + i <= VBO_ATTRIB_LAST_MATERIAL) + ctx->NewState |= _NEW_LIGHT; + + ctx->NewState |= _NEW_CURRENT_ATTRIB; + } + + data += node->attrsz[i]; + } + } + + /* Colormaterial -- this kindof sucks. 
+ */ + if (ctx->Light.ColorMaterialEnabled) { + _mesa_update_color_material(ctx, ctx->Current.Attrib[VBO_ATTRIB_COLOR0]); + } + + /* CurrentExecPrimitive + */ + if (node->prim_count) { + const struct _mesa_prim *prim = &node->prim[node->prim_count - 1]; + if (prim->end) + ctx->Driver.CurrentExecPrimitive = PRIM_OUTSIDE_BEGIN_END; + else + ctx->Driver.CurrentExecPrimitive = prim->mode; + } +} + + + +/** + * Treat the vertex storage as a VBO, define vertex arrays pointing + * into it: + */ +static void vbo_bind_vertex_list(struct gl_context *ctx, + const struct vbo_save_vertex_list *node) +{ + struct vbo_context *vbo = vbo_context(ctx); + struct vbo_save_context *save = &vbo->save; + struct gl_client_array *arrays = save->arrays; + GLuint buffer_offset = node->buffer_offset; + const GLuint *map; + GLuint attr; + GLubyte node_attrsz[VBO_ATTRIB_MAX]; /* copy of node->attrsz[] */ + GLbitfield varying_inputs = 0x0; + + memcpy(node_attrsz, node->attrsz, sizeof(node->attrsz)); + + /* Install the default (ie Current) attributes first, then overlay + * all active ones. + */ + switch (get_program_mode(ctx)) { + case VP_NONE: + for (attr = 0; attr < 16; attr++) { + save->inputs[attr] = &vbo->legacy_currval[attr]; + } + for (attr = 0; attr < MAT_ATTRIB_MAX; attr++) { + save->inputs[attr + 16] = &vbo->mat_currval[attr]; + } + map = vbo->map_vp_none; + break; + case VP_NV: + case VP_ARB: + /* The aliasing of attributes for NV vertex programs has already + * occurred. NV vertex programs cannot access material values, + * nor attributes greater than VERT_ATTRIB_TEX7. + */ + for (attr = 0; attr < 16; attr++) { + save->inputs[attr] = &vbo->legacy_currval[attr]; + save->inputs[attr + 16] = &vbo->generic_currval[attr]; + } + map = vbo->map_vp_arb; + + /* check if VERT_ATTRIB_POS is not read but VERT_BIT_GENERIC0 is read. + * In that case we effectively need to route the data from + * glVertexAttrib(0, val) calls to feed into the GENERIC0 input. + */ + if ((ctx->VertexProgram._Current->Base.InputsRead & VERT_BIT_POS) == 0 && + (ctx->VertexProgram._Current->Base.InputsRead & VERT_BIT_GENERIC0)) { + save->inputs[16] = save->inputs[0]; + node_attrsz[16] = node_attrsz[0]; + node_attrsz[0] = 0; + } + break; + default: + assert(0); + } + + for (attr = 0; attr < VERT_ATTRIB_MAX; attr++) { + const GLuint src = map[attr]; + + if (node_attrsz[src]) { + /* override the default array set above */ + save->inputs[attr] = &arrays[attr]; + + arrays[attr].Ptr = (const GLubyte *) NULL + buffer_offset; + arrays[attr].Size = node->attrsz[src]; + arrays[attr].StrideB = node->vertex_size * sizeof(GLfloat); + arrays[attr].Stride = node->vertex_size * sizeof(GLfloat); + arrays[attr].Type = GL_FLOAT; + arrays[attr].Format = GL_RGBA; + arrays[attr].Enabled = 1; + _mesa_reference_buffer_object(ctx, + &arrays[attr].BufferObj, + node->vertex_store->bufferobj); + arrays[attr]._MaxElement = node->count; /* ??? */ + + assert(arrays[attr].BufferObj->Name); + + buffer_offset += node->attrsz[src] * sizeof(GLfloat); + varying_inputs |= 1<NewState |= _NEW_ARRAY; + } + } + + _mesa_set_varying_vp_inputs( ctx, varying_inputs ); +} + + +static void +vbo_save_loopback_vertex_list(struct gl_context *ctx, + const struct vbo_save_vertex_list *list) +{ + const char *buffer = ctx->Driver.MapBuffer(ctx, + GL_ARRAY_BUFFER_ARB, + GL_READ_ONLY, /* ? 
*/ + list->vertex_store->bufferobj); + + vbo_loopback_vertex_list(ctx, + (const GLfloat *)(buffer + list->buffer_offset), + list->attrsz, + list->prim, + list->prim_count, + list->wrap_count, + list->vertex_size); + + ctx->Driver.UnmapBuffer(ctx, GL_ARRAY_BUFFER_ARB, + list->vertex_store->bufferobj); +} + + +/** + * Execute the buffer and save copied verts. + * This is called from the display list code when executing + * a drawing command. + */ +void +vbo_save_playback_vertex_list(struct gl_context *ctx, void *data) +{ + const struct vbo_save_vertex_list *node = + (const struct vbo_save_vertex_list *) data; + struct vbo_save_context *save = &vbo_context(ctx)->save; + + FLUSH_CURRENT(ctx, 0); + + if (node->prim_count > 0 && node->count > 0) { + + if (ctx->Driver.CurrentExecPrimitive != PRIM_OUTSIDE_BEGIN_END && + node->prim[0].begin) { + + /* Degenerate case: list is called inside begin/end pair and + * includes operations such as glBegin or glDrawArrays. + */ + if (0) + printf("displaylist recursive begin"); + + vbo_save_loopback_vertex_list( ctx, node ); + return; + } + else if (save->replay_flags) { + /* Various degnerate cases: translate into immediate mode + * calls rather than trying to execute in place. + */ + vbo_save_loopback_vertex_list( ctx, node ); + return; + } + + if (ctx->NewState) + _mesa_update_state( ctx ); + + /* XXX also need to check if shader enabled, but invalid */ + if ((ctx->VertexProgram.Enabled && !ctx->VertexProgram._Enabled) || + (ctx->FragmentProgram.Enabled && !ctx->FragmentProgram._Enabled)) { + _mesa_error(ctx, GL_INVALID_OPERATION, + "glBegin (invalid vertex/fragment program)"); + return; + } + + vbo_bind_vertex_list( ctx, node ); + + /* Again... + */ + if (ctx->NewState) + _mesa_update_state( ctx ); + + vbo_context(ctx)->draw_prims(ctx, + save->inputs, + node->prim, + node->prim_count, + NULL, + GL_TRUE, + 0, /* Node is a VBO, so this is ok */ + node->count - 1); + } + + /* Copy to current? 
+ */ + _playback_copy_to_current( ctx, node ); +} + + +#endif /* FEATURE_dlist */ diff --git a/pixman/Makefile.am b/pixman/Makefile.am index 63b08c1fb..062c58a89 100644 --- a/pixman/Makefile.am +++ b/pixman/Makefile.am @@ -1,4 +1,4 @@ -SUBDIRS = pixman test +SUBDIRS = pixman test demos pkgconfigdir=$(libdir)/pkgconfig pkgconfig_DATA=pixman-1.pc diff --git a/pixman/configure.ac b/pixman/configure.ac index ab2ecde1b..5242799bb 100644 --- a/pixman/configure.ac +++ b/pixman/configure.ac @@ -794,6 +794,7 @@ AC_OUTPUT([pixman-1.pc Makefile pixman/Makefile pixman/pixman-version.h + demos/Makefile test/Makefile]) m4_if(m4_eval(pixman_minor % 2), [1], [ diff --git a/pixman/demos/Makefile.am b/pixman/demos/Makefile.am new file mode 100644 index 000000000..2dcdfd350 --- /dev/null +++ b/pixman/demos/Makefile.am @@ -0,0 +1,34 @@ +if HAVE_GTK + +AM_CFLAGS = @OPENMP_CFLAGS@ +AM_LDFLAGS = @OPENMP_CFLAGS@ + +LDADD = $(GTK_LIBS) $(top_builddir)/pixman/libpixman-1.la -lm +INCLUDES = -I$(top_srcdir)/pixman -I$(top_builddir)/pixman $(GTK_CFLAGS) + +GTK_UTILS = gtk-utils.c gtk-utils.h + +DEMOS = \ + clip-test \ + clip-in \ + composite-test \ + gradient-test \ + radial-test \ + alpha-test \ + screen-test \ + convolution-test \ + trap-test + +gradient_test_SOURCES = gradient-test.c $(GTK_UTILS) +alpha_test_SOURCES = alpha-test.c $(GTK_UTILS) +composite_test_SOURCES = composite-test.c $(GTK_UTILS) +clip_test_SOURCES = clip-test.c $(GTK_UTILS) +clip_in_SOURCES = clip-in.c $(GTK_UTILS) +trap_test_SOURCES = trap-test.c $(GTK_UTILS) +screen_test_SOURCES = screen-test.c $(GTK_UTILS) +convolution_test_SOURCES = convolution-test.c $(GTK_UTILS) +radial_test_SOURCES = radial-test.c ../test/utils.c ../test/utils.h $(GTK_UTILS) + +noinst_PROGRAMS = $(DEMOS) + +endif diff --git a/pixman/demos/alpha-test.c b/pixman/demos/alpha-test.c new file mode 100644 index 000000000..92c208142 --- /dev/null +++ b/pixman/demos/alpha-test.c @@ -0,0 +1,117 @@ +#include +#include +#include "pixman.h" +#include "gtk-utils.h" + +int +main (int argc, char **argv) +{ +#define WIDTH 400 +#define HEIGHT 200 + + uint32_t *alpha = malloc (WIDTH * HEIGHT * 4); + uint32_t *dest = malloc (WIDTH * HEIGHT * 4); + uint32_t *src = malloc (WIDTH * HEIGHT * 4); + pixman_image_t *grad_img; + pixman_image_t *alpha_img; + pixman_image_t *dest_img; + pixman_image_t *src_img; + int i; + pixman_gradient_stop_t stops[2] = + { + { pixman_int_to_fixed (0), { 0x0000, 0x0000, 0x0000, 0x0000 } }, + { pixman_int_to_fixed (1), { 0xffff, 0x0000, 0x1111, 0xffff } } + }; + pixman_point_fixed_t p1 = { pixman_double_to_fixed (0), 0 }; + pixman_point_fixed_t p2 = { pixman_double_to_fixed (WIDTH), + pixman_int_to_fixed (0) }; +#if 0 + pixman_transform_t trans = { + { { pixman_double_to_fixed (2), pixman_double_to_fixed (0.5), pixman_double_to_fixed (-100), }, + { pixman_double_to_fixed (0), pixman_double_to_fixed (3), pixman_double_to_fixed (0), }, + { pixman_double_to_fixed (0), pixman_double_to_fixed (0.000), pixman_double_to_fixed (1.0) } + } + }; +#else + pixman_transform_t trans = { + { { pixman_fixed_1, 0, 0 }, + { 0, pixman_fixed_1, 0 }, + { 0, 0, pixman_fixed_1 } } + }; +#endif + + pixman_point_fixed_t c_inner; + pixman_point_fixed_t c_outer; + pixman_fixed_t r_inner; + pixman_fixed_t r_outer; + + for (i = 0; i < WIDTH * HEIGHT; ++i) + alpha[i] = 0x4f00004f; /* pale blue */ + + alpha_img = pixman_image_create_bits (PIXMAN_a8r8g8b8, + WIDTH, HEIGHT, + alpha, + WIDTH * 4); + + for (i = 0; i < WIDTH * HEIGHT; ++i) + dest[i] = 0xffffff00; /* yellow */ + + dest_img = 
pixman_image_create_bits (PIXMAN_a8r8g8b8, + WIDTH, HEIGHT, + dest, + WIDTH * 4); + + for (i = 0; i < WIDTH * HEIGHT; ++i) + src[i] = 0xffff0000; + + src_img = pixman_image_create_bits (PIXMAN_a8r8g8b8, + WIDTH, HEIGHT, + src, + WIDTH * 4); + + c_inner.x = pixman_double_to_fixed (50.0); + c_inner.y = pixman_double_to_fixed (50.0); + c_outer.x = pixman_double_to_fixed (50.0); + c_outer.y = pixman_double_to_fixed (50.0); + r_inner = 0; + r_outer = pixman_double_to_fixed (50.0); + +#if 0 + grad_img = pixman_image_create_conical_gradient (&c_inner, r_inner, + stops, 2); +#endif +#if 0 + grad_img = pixman_image_create_conical_gradient (&c_inner, r_inner, + stops, 2); + grad_img = pixman_image_create_linear_gradient (&c_inner, &c_outer, + r_inner, r_outer, + stops, 2); +#endif + + grad_img = pixman_image_create_linear_gradient (&p1, &p2, + stops, 2); + + pixman_image_set_transform (grad_img, &trans); + pixman_image_set_repeat (grad_img, PIXMAN_REPEAT_PAD); + + pixman_image_composite (PIXMAN_OP_OVER, grad_img, NULL, alpha_img, + 0, 0, 0, 0, 0, 0, 10 * WIDTH, HEIGHT); + + pixman_image_set_alpha_map (src_img, alpha_img, 10, 10); + + pixman_image_composite (PIXMAN_OP_OVER, src_img, NULL, dest_img, + 0, 0, 0, 0, 0, 0, 10 * WIDTH, HEIGHT); + + printf ("0, 0: %x\n", dest[0]); + printf ("10, 10: %x\n", dest[10 * 10 + 10]); + printf ("w, h: %x\n", dest[(HEIGHT - 1) * 100 + (WIDTH - 1)]); + + show_image (dest_img); + + pixman_image_unref (src_img); + pixman_image_unref (grad_img); + pixman_image_unref (alpha_img); + free (dest); + + return 0; +} diff --git a/pixman/demos/clip-in.c b/pixman/demos/clip-in.c new file mode 100644 index 000000000..51579811f --- /dev/null +++ b/pixman/demos/clip-in.c @@ -0,0 +1,50 @@ +#include +#include +#include +#include "pixman.h" +#include "gtk-utils.h" + +/* This test demonstrates that clipping is done totally different depending + * on whether the source is transformed or not. 
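alpha-test.c above centers on pixman_image_set_alpha_map(), which makes a source image borrow its alpha channel from another image, offset by (x, y). A minimal non-GTK sketch of just that call, checking the result by printing one pixel instead of calling show_image() (image sizes and colors here are hypothetical test values):

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include "pixman.h"

#define W 4
#define H 4

int
main (void)
{
    uint32_t *src  = malloc (W * H * 4);
    uint32_t *dst  = malloc (W * H * 4);
    uint32_t *amap = malloc (W * H * 4);
    pixman_image_t *src_img, *dst_img, *alpha_img;
    int i;

    for (i = 0; i < W * H; ++i)
    {
        src[i]  = 0xffff0000;   /* opaque red */
        dst[i]  = 0xff0000ff;   /* opaque blue */
        amap[i] = 0x80000000;   /* roughly 50% alpha; color bits are ignored */
    }

    src_img   = pixman_image_create_bits (PIXMAN_a8r8g8b8, W, H, src,  W * 4);
    dst_img   = pixman_image_create_bits (PIXMAN_a8r8g8b8, W, H, dst,  W * 4);
    alpha_img = pixman_image_create_bits (PIXMAN_a8r8g8b8, W, H, amap, W * 4);

    /* When src_img is used as a source, its alpha now comes from alpha_img */
    pixman_image_set_alpha_map (src_img, alpha_img, 0, 0);

    pixman_image_composite (PIXMAN_OP_OVER, src_img, NULL, dst_img,
                            0, 0, 0, 0, 0, 0, W, H);

    printf ("pixel (0,0) after OVER: %08x\n", (unsigned) dst[0]);

    pixman_image_unref (src_img);
    pixman_image_unref (dst_img);
    pixman_image_unref (alpha_img);
    free (src);
    free (dst);
    free (amap);

    return 0;
}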
+ */ +int +main (int argc, char **argv) +{ +#define WIDTH 200 +#define HEIGHT 200 + +#define SMALL 25 + + uint32_t *sbits = malloc (SMALL * SMALL * 4); + uint32_t *bits = malloc (WIDTH * HEIGHT * 4); + pixman_transform_t trans = { + { + { pixman_double_to_fixed (1.0), pixman_double_to_fixed (0), pixman_double_to_fixed (-0.1), }, + { pixman_double_to_fixed (0), pixman_double_to_fixed (1), pixman_double_to_fixed (-0.1), }, + { pixman_double_to_fixed (0), pixman_double_to_fixed (0), pixman_double_to_fixed (1.0) } + } }; + + pixman_image_t *src_img = pixman_image_create_bits (PIXMAN_a8r8g8b8, SMALL, SMALL, sbits, 4 * SMALL); + pixman_image_t *dest_img = pixman_image_create_bits (PIXMAN_a8r8g8b8, WIDTH, HEIGHT, bits, 4 * WIDTH); + + memset (bits, 0xff, WIDTH * HEIGHT * 4); + memset (sbits, 0x00, SMALL * SMALL * 4); + + pixman_image_composite (PIXMAN_OP_IN, + src_img, NULL, dest_img, + 0, 0, 0, 0, SMALL, SMALL, 200, 200); + + pixman_image_set_transform (src_img, &trans); + + pixman_image_composite (PIXMAN_OP_IN, + src_img, NULL, dest_img, + 0, 0, 0, 0, SMALL * 2, SMALL * 2, 200, 200); + + show_image (dest_img); + + pixman_image_unref (src_img); + pixman_image_unref (dest_img); + free (bits); + + return 0; +} diff --git a/pixman/demos/clip-test.c b/pixman/demos/clip-test.c new file mode 100644 index 000000000..aa0df4482 --- /dev/null +++ b/pixman/demos/clip-test.c @@ -0,0 +1,97 @@ +#include +#include +#include "pixman.h" +#include "gtk-utils.h" + +#define WIDTH 200 +#define HEIGHT 200 + +static pixman_image_t * +create_solid_bits (uint32_t pixel) +{ + uint32_t *pixels = malloc (WIDTH * HEIGHT * 4); + int i; + + for (i = 0; i < WIDTH * HEIGHT; ++i) + pixels[i] = pixel; + + return pixman_image_create_bits (PIXMAN_a8r8g8b8, + WIDTH, HEIGHT, + pixels, + WIDTH * 4); +} + +int +main (int argc, char **argv) +{ + pixman_image_t *gradient_img; + pixman_image_t *src_img, *dst_img; + pixman_gradient_stop_t stops[2] = + { + { pixman_int_to_fixed (0), { 0xffff, 0x0000, 0x0000, 0xffff } }, + { pixman_int_to_fixed (1), { 0xffff, 0xffff, 0x0000, 0xffff } } + }; +#if 0 + pixman_point_fixed_t p1 = { 0, 0 }; + pixman_point_fixed_t p2 = { pixman_int_to_fixed (WIDTH), + pixman_int_to_fixed (HEIGHT) }; +#endif + pixman_point_fixed_t c_inner; + pixman_point_fixed_t c_outer; + pixman_fixed_t r_inner; + pixman_fixed_t r_outer; + pixman_region32_t clip_region; + pixman_transform_t trans = { + { { pixman_double_to_fixed (1.3), pixman_double_to_fixed (0), pixman_double_to_fixed (-0.5), }, + { pixman_double_to_fixed (0), pixman_double_to_fixed (1), pixman_double_to_fixed (-0.5), }, + { pixman_double_to_fixed (0), pixman_double_to_fixed (0), pixman_double_to_fixed (1.0) } + } + }; + + src_img = create_solid_bits (0xff0000ff); + + c_inner.x = pixman_double_to_fixed (100.0); + c_inner.y = pixman_double_to_fixed (100.0); + c_outer.x = pixman_double_to_fixed (100.0); + c_outer.y = pixman_double_to_fixed (100.0); + r_inner = 0; + r_outer = pixman_double_to_fixed (100.0); + + gradient_img = pixman_image_create_radial_gradient (&c_inner, &c_outer, + r_inner, r_outer, + stops, 2); + +#if 0 + gradient_img = pixman_image_create_linear_gradient (&p1, &p2, + stops, 2); + +#endif + + pixman_image_composite (PIXMAN_OP_OVER, gradient_img, NULL, src_img, + 0, 0, 0, 0, 0, 0, WIDTH, HEIGHT); + + pixman_region32_init_rect (&clip_region, 50, 0, 100, 200); + pixman_image_set_clip_region32 (src_img, &clip_region); + pixman_image_set_source_clipping (src_img, TRUE); + pixman_image_set_has_client_clip (src_img, TRUE); + pixman_image_set_transform 
(src_img, &trans); + pixman_image_set_repeat (src_img, PIXMAN_REPEAT_NORMAL); + + dst_img = create_solid_bits (0xffff0000); + pixman_image_composite (PIXMAN_OP_OVER, src_img, NULL, dst_img, + 0, 0, 0, 0, 0, 0, WIDTH, HEIGHT); + + +#if 0 + printf ("0, 0: %x\n", src[0]); + printf ("10, 10: %x\n", src[10 * 10 + 10]); + printf ("w, h: %x\n", src[(HEIGHT - 1) * 100 + (WIDTH - 1)]); +#endif + + show_image (dst_img); + + pixman_image_unref (gradient_img); + pixman_image_unref (src_img); + + return 0; +} diff --git a/pixman/demos/composite-test.c b/pixman/demos/composite-test.c new file mode 100644 index 000000000..79d5d5eac --- /dev/null +++ b/pixman/demos/composite-test.c @@ -0,0 +1,191 @@ +#include +#include +#include +#include "pixman.h" +#include "gtk-utils.h" + +#define WIDTH 60 +#define HEIGHT 60 + +typedef struct { + const char *name; + pixman_op_t op; +} operator_t; + +static const operator_t operators[] = { + { "CLEAR", PIXMAN_OP_CLEAR }, + { "SRC", PIXMAN_OP_SRC }, + { "DST", PIXMAN_OP_DST }, + { "OVER", PIXMAN_OP_OVER }, + { "OVER_REVERSE", PIXMAN_OP_OVER_REVERSE }, + { "IN", PIXMAN_OP_IN }, + { "IN_REVERSE", PIXMAN_OP_IN_REVERSE }, + { "OUT", PIXMAN_OP_OUT }, + { "OUT_REVERSE", PIXMAN_OP_OUT_REVERSE }, + { "ATOP", PIXMAN_OP_ATOP }, + { "ATOP_REVERSE", PIXMAN_OP_ATOP_REVERSE }, + { "XOR", PIXMAN_OP_XOR }, + { "ADD", PIXMAN_OP_ADD }, + { "SATURATE", PIXMAN_OP_SATURATE }, + + { "MULTIPLY", PIXMAN_OP_MULTIPLY }, + { "SCREEN", PIXMAN_OP_SCREEN }, + { "OVERLAY", PIXMAN_OP_OVERLAY }, + { "DARKEN", PIXMAN_OP_DARKEN }, + { "LIGHTEN", PIXMAN_OP_LIGHTEN }, + { "COLOR_DODGE", PIXMAN_OP_COLOR_DODGE }, + { "COLOR_BURN", PIXMAN_OP_COLOR_BURN }, + { "HARD_LIGHT", PIXMAN_OP_HARD_LIGHT }, + { "SOFT_LIGHT", PIXMAN_OP_SOFT_LIGHT }, + { "DIFFERENCE", PIXMAN_OP_DIFFERENCE }, + { "EXCLUSION", PIXMAN_OP_EXCLUSION }, + { "HSL_HUE", PIXMAN_OP_HSL_HUE }, + { "HSL_SATURATION", PIXMAN_OP_HSL_SATURATION }, + { "HSL_COLOR", PIXMAN_OP_HSL_COLOR }, + { "HSL_LUMINOSITY", PIXMAN_OP_HSL_LUMINOSITY }, +}; + +static uint32_t +reader (const void *src, int size) +{ + switch (size) + { + case 1: + return *(uint8_t *)src; + case 2: + return *(uint16_t *)src; + case 4: + return *(uint32_t *)src; + default: + g_assert_not_reached(); + } +} + +static void +writer (void *src, uint32_t value, int size) +{ + switch (size) + { + case 1: + *(uint8_t *)src = value; + break; + + case 2: + *(uint16_t *)src = value; + break; + + case 4: + *(uint32_t *)src = value; + break; + + default: + break; + } +} + +int +main (int argc, char **argv) +{ +#define d2f pixman_double_to_fixed + + GtkWidget *window, *swindow; + GtkWidget *table; + uint32_t *dest = malloc (WIDTH * HEIGHT * 4); + uint32_t *src = malloc (WIDTH * HEIGHT * 4); + pixman_image_t *src_img; + pixman_image_t *dest_img; + pixman_point_fixed_t p1 = { -10 << 0, 0 }; + pixman_point_fixed_t p2 = { WIDTH << 16, (HEIGHT - 10) << 16 }; + uint16_t full = 0xcfff; + uint16_t low = 0x5000; + uint16_t alpha = 0xffff; + pixman_gradient_stop_t stops[6] = + { + { d2f (0.0), { full, low, low, alpha } }, + { d2f (0.25), { full, full, low, alpha } }, + { d2f (0.4), { low, full, low, alpha } }, + { d2f (0.6), { low, full, full, alpha } }, + { d2f (0.8), { low, low, full, alpha } }, + { d2f (1.0), { full, low, full, alpha } }, + }; + + int i; + + gtk_init (&argc, &argv); + + window = gtk_window_new (GTK_WINDOW_TOPLEVEL); + + gtk_window_set_default_size (GTK_WINDOW (window), 800, 600); + + g_signal_connect (window, "delete-event", + G_CALLBACK (gtk_main_quit), + NULL); + table = gtk_table_new 
(G_N_ELEMENTS (operators) / 6, 6, TRUE); + + src_img = pixman_image_create_linear_gradient (&p1, &p2, stops, + sizeof (stops) / sizeof (stops[0])); + + pixman_image_set_repeat (src_img, PIXMAN_REPEAT_PAD); + + dest_img = pixman_image_create_bits (PIXMAN_a8r8g8b8, + WIDTH, HEIGHT, + dest, + WIDTH * 4); + pixman_image_set_accessors (dest_img, reader, writer); + + for (i = 0; i < G_N_ELEMENTS (operators); ++i) + { + GtkWidget *image; + GdkPixbuf *pixbuf; + GtkWidget *vbox; + GtkWidget *label; + int j, k; + + vbox = gtk_vbox_new (FALSE, 0); + + label = gtk_label_new (operators[i].name); + gtk_box_pack_start (GTK_BOX (vbox), label, FALSE, FALSE, 6); + gtk_widget_show (label); + + for (j = 0; j < HEIGHT; ++j) + { + for (k = 0; k < WIDTH; ++k) + dest[j * WIDTH + k] = 0x7f6f6f00; + } + pixman_image_composite (operators[i].op, src_img, NULL, dest_img, + 0, 0, 0, 0, 0, 0, WIDTH, HEIGHT); + pixbuf = pixbuf_from_argb32 (pixman_image_get_data (dest_img), TRUE, + WIDTH, HEIGHT, WIDTH * 4); + image = gtk_image_new_from_pixbuf (pixbuf); + gtk_box_pack_start (GTK_BOX (vbox), image, FALSE, FALSE, 0); + gtk_widget_show (image); + + gtk_table_attach_defaults (GTK_TABLE (table), vbox, + i % 6, (i % 6) + 1, i / 6, (i / 6) + 1); + gtk_widget_show (vbox); + + g_object_unref (pixbuf); + } + + pixman_image_unref (src_img); + free (src); + pixman_image_unref (dest_img); + free (dest); + + swindow = gtk_scrolled_window_new (NULL, NULL); + gtk_scrolled_window_set_policy (GTK_SCROLLED_WINDOW (swindow), + GTK_POLICY_AUTOMATIC, + GTK_POLICY_AUTOMATIC); + + gtk_scrolled_window_add_with_viewport (GTK_SCROLLED_WINDOW (swindow), table); + gtk_widget_show (table); + + gtk_container_add (GTK_CONTAINER (window), swindow); + gtk_widget_show (swindow); + + gtk_widget_show (window); + + gtk_main (); + + return 0; +} diff --git a/pixman/demos/convolution-test.c b/pixman/demos/convolution-test.c new file mode 100644 index 000000000..da284af7b --- /dev/null +++ b/pixman/demos/convolution-test.c @@ -0,0 +1,47 @@ +#include +#include +#include "pixman.h" +#include "gtk-utils.h" + +int +main (int argc, char **argv) +{ +#define WIDTH 200 +#define HEIGHT 200 + +#define d2f pixman_double_to_fixed + + uint32_t *src = malloc (WIDTH * HEIGHT * 4); + uint32_t *mask = malloc (WIDTH * HEIGHT * 4); + uint32_t *dest = malloc (WIDTH * HEIGHT * 4); + pixman_fixed_t convolution[] = + { + d2f (3), d2f (3), + d2f (0.5), d2f (0.5), d2f (0.5), + d2f (0.5), d2f (0.5), d2f (0.5), + d2f (0.5), d2f (0.5), d2f (0.5), + }; + pixman_image_t *simg, *mimg, *dimg; + + int i; + + for (i = 0; i < WIDTH * HEIGHT; ++i) + { + src[i] = 0x7f007f00; + mask[i] = (i % 256) * 0x01000000; + dest[i] = 0; + } + + simg = pixman_image_create_bits (PIXMAN_a8r8g8b8, WIDTH, HEIGHT, src, WIDTH * 4); + mimg = pixman_image_create_bits (PIXMAN_a8r8g8b8, WIDTH, HEIGHT, mask, WIDTH * 4); + dimg = pixman_image_create_bits (PIXMAN_a8r8g8b8, WIDTH, HEIGHT, dest, WIDTH * 4); + + pixman_image_set_filter (mimg, PIXMAN_FILTER_CONVOLUTION, + convolution, 11); + + pixman_image_composite (PIXMAN_OP_OVER, simg, mimg, dimg, 0, 0, 0, 0, 0, 0, WIDTH, HEIGHT); + + show_image (dimg); + + return 0; +} diff --git a/pixman/demos/gradient-test.c b/pixman/demos/gradient-test.c new file mode 100644 index 000000000..fc84844b0 --- /dev/null +++ b/pixman/demos/gradient-test.c @@ -0,0 +1,89 @@ +#include +#include +#include "pixman.h" +#include "gtk-utils.h" + +int +main (int argc, char **argv) +{ +#define WIDTH 400 +#define HEIGHT 200 + + uint32_t *dest = malloc (WIDTH * HEIGHT * 4); + pixman_image_t *src_img; + 
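    /*
+     * A note on pixman_gradient_stop_t, used for the stops declared
+     * below: each stop pairs a position along the gradient (a 16.16
+     * pixman_fixed_t in [0, 1]) with a pixman_color_t whose red, green,
+     * blue and alpha channels are 16 bit, so 0xffff means full
+     * intensity.  A purely illustrative stop (not used by this demo)
+     * placing a mid-grey at the halfway point would be:
+     *
+     *     pixman_gradient_stop_t grey =
+     *         { pixman_double_to_fixed (0.5),
+     *           { 0x8000, 0x8000, 0x8000, 0xffff } };
+     */ +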
pixman_image_t *dest_img; + int i; + pixman_gradient_stop_t stops[2] = + { + { pixman_int_to_fixed (0), { 0xffff, 0xeeee, 0xeeee, 0xeeee } }, + { pixman_int_to_fixed (1), { 0xffff, 0x1111, 0x1111, 0x1111 } } + }; + pixman_point_fixed_t p1 = { pixman_double_to_fixed (0), 0 }; + pixman_point_fixed_t p2 = { pixman_double_to_fixed (WIDTH / 8.), + pixman_int_to_fixed (0) }; +#if 0 + pixman_transform_t trans = { + { { pixman_double_to_fixed (2), pixman_double_to_fixed (0.5), pixman_double_to_fixed (-100), }, + { pixman_double_to_fixed (0), pixman_double_to_fixed (3), pixman_double_to_fixed (0), }, + { pixman_double_to_fixed (0), pixman_double_to_fixed (0.000), pixman_double_to_fixed (1.0) } + } + }; +#else + pixman_transform_t trans = { + { { pixman_fixed_1, 0, 0 }, + { 0, pixman_fixed_1, 0 }, + { 0, 0, pixman_fixed_1 } } + }; +#endif + + pixman_point_fixed_t c_inner; + pixman_point_fixed_t c_outer; + pixman_fixed_t r_inner; + pixman_fixed_t r_outer; + + for (i = 0; i < WIDTH * HEIGHT; ++i) + dest[i] = 0x4f00004f; /* pale blue */ + + dest_img = pixman_image_create_bits (PIXMAN_a8r8g8b8, + WIDTH, HEIGHT, + dest, + WIDTH * 4); + + c_inner.x = pixman_double_to_fixed (50.0); + c_inner.y = pixman_double_to_fixed (50.0); + c_outer.x = pixman_double_to_fixed (50.0); + c_outer.y = pixman_double_to_fixed (50.0); + r_inner = 0; + r_outer = pixman_double_to_fixed (50.0); + + src_img = pixman_image_create_conical_gradient (&c_inner, r_inner, + stops, 2); +#if 0 + src_img = pixman_image_create_conical_gradient (&c_inner, r_inner, + stops, 2); + src_img = pixman_image_create_linear_gradient (&c_inner, &c_outer, + r_inner, r_outer, + stops, 2); +#endif + + src_img = pixman_image_create_linear_gradient (&p1, &p2, + stops, 2); + + pixman_image_set_transform (src_img, &trans); + pixman_image_set_repeat (src_img, PIXMAN_REPEAT_PAD); + + pixman_image_composite (PIXMAN_OP_OVER, src_img, NULL, dest_img, + 0, 0, 0, 0, 0, 0, 10 * WIDTH, HEIGHT); + + printf ("0, 0: %x\n", dest[0]); + printf ("10, 10: %x\n", dest[10 * 10 + 10]); + printf ("w, h: %x\n", dest[(HEIGHT - 1) * 100 + (WIDTH - 1)]); + + show_image (dest_img); + + pixman_image_unref (src_img); + pixman_image_unref (dest_img); + free (dest); + + return 0; +} diff --git a/pixman/demos/gtk-utils.c b/pixman/demos/gtk-utils.c new file mode 100644 index 000000000..f45cdc912 --- /dev/null +++ b/pixman/demos/gtk-utils.c @@ -0,0 +1,115 @@ +#include +#include +#include "pixman-private.h" /* For image->bits.format + * FIXME: there should probably be public API for this + */ +#include "gtk-utils.h" + +GdkPixbuf * +pixbuf_from_argb32 (uint32_t *bits, + gboolean has_alpha, + int width, + int height, + int stride) +{ + GdkPixbuf *pixbuf = gdk_pixbuf_new (GDK_COLORSPACE_RGB, TRUE, + 8, width, height); + int p_stride = gdk_pixbuf_get_rowstride (pixbuf); + guint32 *p_bits = (guint32 *)gdk_pixbuf_get_pixels (pixbuf); + int w, h; + + for (h = 0; h < height; ++h) + { + for (w = 0; w < width; ++w) + { + uint32_t argb = bits[h * (stride / 4) + w]; + guint r, g, b, a; + char *pb = (char *)p_bits; + + pb += h * p_stride + w * 4; + + r = (argb & 0x00ff0000) >> 16; + g = (argb & 0x0000ff00) >> 8; + b = (argb & 0x000000ff) >> 0; + a = has_alpha? 
(argb & 0xff000000) >> 24 : 0xff; + + if (a) + { + r = (r * 255) / a; + g = (g * 255) / a; + b = (b * 255) / a; + } + + if (r > 255) r = 255; + if (g > 255) g = 255; + if (b > 255) b = 255; + + pb[0] = r; + pb[1] = g; + pb[2] = b; + pb[3] = a; + } + } + + return pixbuf; +} + + +static gboolean +on_expose (GtkWidget *widget, GdkEventExpose *expose, gpointer data) +{ + GdkPixbuf *pixbuf = data; + + gdk_draw_pixbuf (widget->window, NULL, + pixbuf, 0, 0, 0, 0, + gdk_pixbuf_get_width (pixbuf), + gdk_pixbuf_get_height (pixbuf), + GDK_RGB_DITHER_NONE, + 0, 0); + + return TRUE; +} + +void +show_image (pixman_image_t *image) +{ + GtkWidget *window; + GdkPixbuf *pixbuf; + int width, height, stride; + int argc; + char **argv; + char *arg0 = g_strdup ("pixman-test-program"); + gboolean has_alpha; + pixman_format_code_t format; + + argc = 1; + argv = (char **)&arg0; + + gtk_init (&argc, &argv); + + window = gtk_window_new (GTK_WINDOW_TOPLEVEL); + width = pixman_image_get_width (image); + height = pixman_image_get_height (image); + stride = pixman_image_get_stride (image); + + gtk_window_set_default_size (GTK_WINDOW (window), width, height); + + format = image->bits.format; + + if (format == PIXMAN_a8r8g8b8) + has_alpha = TRUE; + else if (format == PIXMAN_x8r8g8b8) + has_alpha = FALSE; + else + g_error ("Can't deal with this format: %x\n", format); + + pixbuf = pixbuf_from_argb32 (pixman_image_get_data (image), has_alpha, + width, height, stride); + + g_signal_connect (window, "expose_event", G_CALLBACK (on_expose), pixbuf); + g_signal_connect (window, "delete_event", G_CALLBACK (gtk_main_quit), NULL); + + gtk_widget_show (window); + + gtk_main (); +} diff --git a/pixman/demos/gtk-utils.h b/pixman/demos/gtk-utils.h new file mode 100644 index 000000000..2cb13bcf0 --- /dev/null +++ b/pixman/demos/gtk-utils.h @@ -0,0 +1,13 @@ +#include +#include +#include +#include +#include "pixman.h" + +void show_image (pixman_image_t *image); + +GdkPixbuf *pixbuf_from_argb32 (uint32_t *bits, + gboolean has_alpha, + int width, + int height, + int stride); diff --git a/pixman/demos/radial-test.c b/pixman/demos/radial-test.c new file mode 100644 index 000000000..35e90d786 --- /dev/null +++ b/pixman/demos/radial-test.c @@ -0,0 +1,198 @@ +#include "../test/utils.h" +#include "gtk-utils.h" + +#define NUM_GRADIENTS 7 +#define NUM_STOPS 3 +#define NUM_REPEAT 4 +#define SIZE 128 +#define WIDTH (SIZE * NUM_GRADIENTS) +#define HEIGHT (SIZE * NUM_REPEAT) + +/* + * We want to test all the possible relative positions of the start + * and end circle: + * + * - The start circle can be smaller/equal/bigger than the end + * circle. A radial gradient can be classified in one of these + * three cases depending on the sign of dr. + * + * - The smaller circle can be completely inside/internally + * tangent/outside (at least in part) of the bigger circle. This + * classification is the same as the one which can be computed by + * examining the sign of a = (dx^2 + dy^2 - dr^2). + * + * - If the two circles have the same size, neither can be inside or + * internally tangent + * + * This test draws radial gradients whose circles always have the same + * centers (0, 0) and (1, 0), but with different radiuses. 
From left + * to right: + * + * - Small start circle completely inside the end circle + * 0.25 -> 1.75; dr = 1.5 > 0; a = 1 - 1.50^2 < 0 + * + * - Small start circle internally tangent to the end circle + * 0.50 -> 1.50; dr = 1.0 > 0; a = 1 - 1.00^2 = 0 + * + * - Small start circle outside of the end circle + * 0.50 -> 1.00; dr = 0.5 > 0; a = 1 - 0.50^2 > 0 + * + * - Start circle with the same size as the end circle + * 1.00 -> 1.00; dr = 0.0 = 0; a = 1 - 0.00^2 > 0 + * + * - Small end circle outside of the start circle + * 1.00 -> 0.50; dr = -0.5 > 0; a = 1 - 0.50^2 > 0 + * + * - Small end circle internally tangent to the start circle + * 1.50 -> 0.50; dr = -1.0 > 0; a = 1 - 1.00^2 = 0 + * + * - Small end circle completely inside the start circle + * 1.75 -> 0.25; dr = -1.5 > 0; a = 1 - 1.50^2 < 0 + * + */ + +const static double radiuses[NUM_GRADIENTS] = { + 0.25, + 0.50, + 0.50, + 1.00, + 1.00, + 1.50, + 1.75 +}; + +#define double_to_color(x) \ + (((uint32_t) ((x)*65536)) - (((uint32_t) ((x)*65536)) >> 16)) + +#define PIXMAN_STOP(offset,r,g,b,a) \ + { pixman_double_to_fixed (offset), \ + { \ + double_to_color (r), \ + double_to_color (g), \ + double_to_color (b), \ + double_to_color (a) \ + } \ + } + +static const pixman_gradient_stop_t stops[NUM_STOPS] = { + PIXMAN_STOP (0.0, 1, 0, 0, 0.75), + PIXMAN_STOP (0.70710678, 0, 1, 0, 0), + PIXMAN_STOP (1.0, 0, 0, 1, 1) +}; + +static pixman_image_t * +create_radial (int index) +{ + pixman_point_fixed_t p0, p1; + pixman_fixed_t r0, r1; + double x0, x1, radius0, radius1, left, right, center; + + x0 = 0; + x1 = 1; + radius0 = radiuses[index]; + radius1 = radiuses[NUM_GRADIENTS - index - 1]; + + /* center the gradient */ + left = MIN (x0 - radius0, x1 - radius1); + right = MAX (x0 + radius0, x1 + radius1); + center = (left + right) * 0.5; + x0 -= center; + x1 -= center; + + /* scale to make it fit within a 1x1 rect centered in (0,0) */ + x0 *= 0.25; + x1 *= 0.25; + radius0 *= 0.25; + radius1 *= 0.25; + + p0.x = pixman_double_to_fixed (x0); + p0.y = pixman_double_to_fixed (0); + + p1.x = pixman_double_to_fixed (x1); + p1.y = pixman_double_to_fixed (0); + + r0 = pixman_double_to_fixed (radius0); + r1 = pixman_double_to_fixed (radius1); + + return pixman_image_create_radial_gradient (&p0, &p1, + r0, r1, + stops, NUM_STOPS); +} + +static const pixman_repeat_t repeat[NUM_REPEAT] = { + PIXMAN_REPEAT_NONE, + PIXMAN_REPEAT_NORMAL, + PIXMAN_REPEAT_REFLECT, + PIXMAN_REPEAT_PAD +}; + +int +main (int argc, char **argv) +{ + pixman_transform_t transform; + pixman_image_t *src_img, *dest_img; + int i, j; + + enable_fp_exceptions (); + + dest_img = pixman_image_create_bits (PIXMAN_a8r8g8b8, + WIDTH, HEIGHT, + NULL, 0); + + pixman_transform_init_identity (&transform); + + /* + * The create_radial() function returns gradients centered in the + * origin and whose interesting part fits a 1x1 square. We want to + * paint these gradients on a SIZExSIZE square and to make things + * easier we want the origin in the top-left corner of the square + * we want to see. + */ + pixman_transform_translate (NULL, &transform, + pixman_double_to_fixed (0.5), + pixman_double_to_fixed (0.5)); + + pixman_transform_scale (NULL, &transform, + pixman_double_to_fixed (SIZE), + pixman_double_to_fixed (SIZE)); + + /* + * Gradients are evaluated at the center of each pixel, so we need + * to translate by half a pixel to trigger some interesting + * cornercases. 
In particular, the original implementation of PDF + * radial gradients tried to divide by 0 when using this transform + * on the "tangent circles" cases. + */ + pixman_transform_translate (NULL, &transform, + pixman_double_to_fixed (0.5), + pixman_double_to_fixed (0.5)); + + for (i = 0; i < NUM_GRADIENTS; i++) + { + src_img = create_radial (i); + pixman_image_set_transform (src_img, &transform); + + for (j = 0; j < NUM_REPEAT; j++) + { + pixman_image_set_repeat (src_img, repeat[j]); + + pixman_image_composite32 (PIXMAN_OP_OVER, + src_img, + NULL, + dest_img, + 0, 0, + 0, 0, + i * SIZE, j * SIZE, + SIZE, SIZE); + + } + + pixman_image_unref (src_img); + } + + show_image (dest_img); + + pixman_image_unref (dest_img); + + return 0; +} diff --git a/pixman/demos/screen-test.c b/pixman/demos/screen-test.c new file mode 100644 index 000000000..e69dba3de --- /dev/null +++ b/pixman/demos/screen-test.c @@ -0,0 +1,44 @@ +#include +#include +#include "pixman.h" +#include "gtk-utils.h" + +int +main (int argc, char **argv) +{ +#define WIDTH 40 +#define HEIGHT 40 + + uint32_t *src1 = malloc (WIDTH * HEIGHT * 4); + uint32_t *src2 = malloc (WIDTH * HEIGHT * 4); + uint32_t *src3 = malloc (WIDTH * HEIGHT * 4); + uint32_t *dest = malloc (3 * WIDTH * 2 * HEIGHT * 4); + pixman_image_t *simg1, *simg2, *simg3, *dimg; + + int i; + + for (i = 0; i < WIDTH * HEIGHT; ++i) + { + src1[i] = 0x7ff00000; + src2[i] = 0x7f00ff00; + src3[i] = 0x7f0000ff; + } + + for (i = 0; i < 3 * WIDTH * 2 * HEIGHT; ++i) + { + dest[i] = 0x0; + } + + simg1 = pixman_image_create_bits (PIXMAN_a8r8g8b8, WIDTH, HEIGHT, src1, WIDTH * 4); + simg2 = pixman_image_create_bits (PIXMAN_a8r8g8b8, WIDTH, HEIGHT, src2, WIDTH * 4); + simg3 = pixman_image_create_bits (PIXMAN_a8r8g8b8, WIDTH, HEIGHT, src3, WIDTH * 4); + dimg = pixman_image_create_bits (PIXMAN_a8r8g8b8, 3 * WIDTH, 2 * HEIGHT, dest, 3 * WIDTH * 4); + + pixman_image_composite (PIXMAN_OP_SCREEN, simg1, NULL, dimg, 0, 0, 0, 0, WIDTH, HEIGHT / 4, WIDTH, HEIGHT); + pixman_image_composite (PIXMAN_OP_SCREEN, simg2, NULL, dimg, 0, 0, 0, 0, (WIDTH/2), HEIGHT / 4 + HEIGHT / 2, WIDTH, HEIGHT); + pixman_image_composite (PIXMAN_OP_SCREEN, simg3, NULL, dimg, 0, 0, 0, 0, (4 * WIDTH) / 3, HEIGHT, WIDTH, HEIGHT); + + show_image (dimg); + + return 0; +} diff --git a/pixman/demos/trap-test.c b/pixman/demos/trap-test.c new file mode 100644 index 000000000..19295e7a5 --- /dev/null +++ b/pixman/demos/trap-test.c @@ -0,0 +1,49 @@ +#include +#include +#include +#include "pixman.h" +#include "gtk-utils.h" + +int +main (int argc, char **argv) +{ +#define WIDTH 200 +#define HEIGHT 200 + + pixman_image_t *src_img; + pixman_image_t *mask_img; + pixman_image_t *dest_img; + pixman_trap_t trap; + pixman_color_t white = { 0x0000, 0xffff, 0x0000, 0xffff }; + uint32_t *bits = malloc (WIDTH * HEIGHT * 4); + uint32_t *mbits = malloc (WIDTH * HEIGHT); + + memset (mbits, 0, WIDTH * HEIGHT); + memset (bits, 0xff, WIDTH * HEIGHT * 4); + + trap.top.l = pixman_int_to_fixed (50) + 0x8000; + trap.top.r = pixman_int_to_fixed (150) + 0x8000; + trap.top.y = pixman_int_to_fixed (30); + + trap.bot.l = pixman_int_to_fixed (50) + 0x8000; + trap.bot.r = pixman_int_to_fixed (150) + 0x8000; + trap.bot.y = pixman_int_to_fixed (150); + + mask_img = pixman_image_create_bits (PIXMAN_a8, WIDTH, HEIGHT, mbits, WIDTH); + src_img = pixman_image_create_solid_fill (&white); + dest_img = pixman_image_create_bits (PIXMAN_a8r8g8b8, WIDTH, HEIGHT, bits, WIDTH * 4); + + pixman_add_traps (mask_img, 0, 0, 1, &trap); + + pixman_image_composite (PIXMAN_OP_OVER, + 
src_img, mask_img, dest_img, + 0, 0, 0, 0, 0, 0, WIDTH, HEIGHT); + + show_image (dest_img); + + pixman_image_unref (src_img); + pixman_image_unref (dest_img); + free (bits); + + return 0; +} diff --git a/pixman/pixman/pixman-arm-common.h b/pixman/pixman/pixman-arm-common.h index 372e9f9a8..9b1322b6c 100644 --- a/pixman/pixman/pixman-arm-common.h +++ b/pixman/pixman/pixman-arm-common.h @@ -282,19 +282,20 @@ cputype##_composite_##name (pixman_implementation_t *imp, \ src_type, dst_type) \ void \ pixman_scaled_nearest_scanline_##name##_##op##_asm_##cputype ( \ - int32_t w, \ - dst_type * dst, \ - src_type * src, \ - pixman_fixed_t vx, \ - pixman_fixed_t unit_x);\ + int32_t w, \ + dst_type * dst, \ + const src_type * src, \ + pixman_fixed_t vx, \ + pixman_fixed_t unit_x); \ \ static force_inline void \ scaled_nearest_scanline_##cputype##_##name##_##op (dst_type * pd, \ - src_type * ps, \ + const src_type * ps, \ int32_t w, \ pixman_fixed_t vx, \ pixman_fixed_t unit_x, \ - pixman_fixed_t max_vx) \ + pixman_fixed_t max_vx, \ + pixman_bool_t zero_src) \ { \ pixman_scaled_nearest_scanline_##name##_##op##_asm_##cputype (w, pd, ps, \ vx, unit_x);\ @@ -316,4 +317,48 @@ FAST_NEAREST_MAINLOOP (cputype##_##name##_pad_##op, \ SIMPLE_NEAREST_FAST_PATH_NONE (op,s,d,func), \ SIMPLE_NEAREST_FAST_PATH_PAD (op,s,d,func) +#define PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_A8_DST(flags, cputype, name, op, \ + src_type, dst_type) \ +void \ +pixman_scaled_nearest_scanline_##name##_##op##_asm_##cputype ( \ + int32_t w, \ + dst_type * dst, \ + const src_type * src, \ + pixman_fixed_t vx, \ + pixman_fixed_t unit_x, \ + const uint8_t * mask); \ + \ +static force_inline void \ +scaled_nearest_scanline_##cputype##_##name##_##op (const uint8_t * mask, \ + dst_type * pd, \ + const src_type * ps, \ + int32_t w, \ + pixman_fixed_t vx, \ + pixman_fixed_t unit_x, \ + pixman_fixed_t max_vx, \ + pixman_bool_t zero_src) \ +{ \ + if ((flags & SKIP_ZERO_SRC) && zero_src) \ + return; \ + pixman_scaled_nearest_scanline_##name##_##op##_asm_##cputype (w, pd, ps, \ + vx, unit_x, \ + mask); \ +} \ + \ +FAST_NEAREST_MAINLOOP_COMMON (cputype##_##name##_cover_##op, \ + scaled_nearest_scanline_##cputype##_##name##_##op,\ + src_type, uint8_t, dst_type, COVER, TRUE, FALSE)\ +FAST_NEAREST_MAINLOOP_COMMON (cputype##_##name##_none_##op, \ + scaled_nearest_scanline_##cputype##_##name##_##op,\ + src_type, uint8_t, dst_type, NONE, TRUE, FALSE) \ +FAST_NEAREST_MAINLOOP_COMMON (cputype##_##name##_pad_##op, \ + scaled_nearest_scanline_##cputype##_##name##_##op,\ + src_type, uint8_t, dst_type, PAD, TRUE, FALSE) + +/* Provide entries for the fast path table */ +#define PIXMAN_ARM_SIMPLE_NEAREST_A8_MASK_FAST_PATH(op,s,d,func) \ + SIMPLE_NEAREST_A8_MASK_FAST_PATH_COVER (op,s,d,func), \ + SIMPLE_NEAREST_A8_MASK_FAST_PATH_NONE (op,s,d,func), \ + SIMPLE_NEAREST_A8_MASK_FAST_PATH_PAD (op,s,d,func) + #endif diff --git a/pixman/pixman/pixman-arm-neon-asm.S b/pixman/pixman/pixman-arm-neon-asm.S index 51533d42f..47daf457c 100644 --- a/pixman/pixman/pixman-arm-neon-asm.S +++ b/pixman/pixman/pixman-arm-neon-asm.S @@ -1,2365 +1,2393 @@ -/* - * Copyright © 2009 Nokia Corporation - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is 
furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER - * DEALINGS IN THE SOFTWARE. - * - * Author: Siarhei Siamashka (siarhei.siamashka@nokia.com) - */ - -/* - * This file contains implementations of NEON optimized pixel processing - * functions. There is no full and detailed tutorial, but some functions - * (those which are exposing some new or interesting features) are - * extensively commented and can be used as examples. - * - * You may want to have a look at the comments for following functions: - * - pixman_composite_over_8888_0565_asm_neon - * - pixman_composite_over_n_8_0565_asm_neon - */ - -/* Prevent the stack from becoming executable for no reason... */ -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack,"",%progbits -#endif - - .text - .fpu neon - .arch armv7a - .object_arch armv4 - .eabi_attribute 10, 0 /* suppress Tag_FP_arch */ - .eabi_attribute 12, 0 /* suppress Tag_Advanced_SIMD_arch */ - .arm - .altmacro - -#include "pixman-arm-neon-asm.h" - -/* Global configuration options and preferences */ - -/* - * The code can optionally make use of unaligned memory accesses to improve - * performance of handling leading/trailing pixels for each scanline. - * Configuration variable RESPECT_STRICT_ALIGNMENT can be set to 0 for - * example in linux if unaligned memory accesses are not configured to - * generate.exceptions. - */ -.set RESPECT_STRICT_ALIGNMENT, 1 - -/* - * Set default prefetch type. There is a choice between the following options: - * - * PREFETCH_TYPE_NONE (may be useful for the ARM cores where PLD is set to work - * as NOP to workaround some HW bugs or for whatever other reason) - * - * PREFETCH_TYPE_SIMPLE (may be useful for simple single-issue ARM cores where - * advanced prefetch intruduces heavy overhead) - * - * PREFETCH_TYPE_ADVANCED (useful for superscalar cores such as ARM Cortex-A8 - * which can run ARM and NEON instructions simultaneously so that extra ARM - * instructions do not add (many) extra cycles, but improve prefetch efficiency) - * - * Note: some types of function can't support advanced prefetch and fallback - * to simple one (those which handle 24bpp pixels) - */ -.set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_ADVANCED - -/* Prefetch distance in pixels for simple prefetch */ -.set PREFETCH_DISTANCE_SIMPLE, 64 - -/* - * Implementation of pixman_composite_over_8888_0565_asm_neon - * - * This function takes a8r8g8b8 source buffer, r5g6b5 destination buffer and - * performs OVER compositing operation. Function fast_composite_over_8888_0565 - * from pixman-fast-path.c does the same in C and can be used as a reference. - * - * First we need to have some NEON assembly code which can do the actual - * operation on the pixels and provide it to the template macro. 
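/*
 * A rough scalar sketch of the per-pixel work described above.  The real
 * C reference is fast_composite_over_8888_0565 in pixman-fast-path.c;
 * this standalone helper is illustrative only, and its rounding is the
 * plain (x + 127) / 255 form rather than the vrshr/vraddhn sequence the
 * NEON code uses.
 */

#include <stdint.h>

/* OVER one premultiplied a8r8g8b8 source pixel onto an r5g6b5
 * destination pixel: dst = src + (1 - src.alpha) * dst. */
static inline uint16_t
over_8888_0565_pixel (uint32_t src, uint16_t dst)
{
    /* expand r5g6b5 to 8 bits per channel, replicating the high bits */
    uint32_t dr = (dst >> 11) & 0x1f, dg = (dst >> 5) & 0x3f, db = dst & 0x1f;
    dr = (dr << 3) | (dr >> 2);
    dg = (dg << 2) | (dg >> 4);
    db = (db << 3) | (db >> 2);

    /* scale the destination by the inverted source alpha, then add the
     * (premultiplied) source channels, saturating at 255 */
    uint32_t ia = 255 - (src >> 24);
    uint32_t r = ((src >> 16) & 0xff) + (dr * ia + 127) / 255;
    uint32_t g = ((src >>  8) & 0xff) + (dg * ia + 127) / 255;
    uint32_t b = ( src        & 0xff) + (db * ia + 127) / 255;
    if (r > 255) r = 255;
    if (g > 255) g = 255;
    if (b > 255) b = 255;

    /* repack to r5g6b5 */
    return (uint16_t) (((r & 0xf8) << 8) | ((g & 0xfc) << 3) | (b >> 3));
}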
- * - * Template macro quite conveniently takes care of emitting all the necessary - * code for memory reading and writing (including quite tricky cases of - * handling unaligned leading/trailing pixels), so we only need to deal with - * the data in NEON registers. - * - * NEON registers allocation in general is recommented to be the following: - * d0, d1, d2, d3 - contain loaded source pixel data - * d4, d5, d6, d7 - contain loaded destination pixels (if they are needed) - * d24, d25, d26, d27 - contain loading mask pixel data (if mask is used) - * d28, d29, d30, d31 - place for storing the result (destination pixels) - * - * As can be seen above, four 64-bit NEON registers are used for keeping - * intermediate pixel data and up to 8 pixels can be processed in one step - * for 32bpp formats (16 pixels for 16bpp, 32 pixels for 8bpp). - * - * This particular function uses the following registers allocation: - * d0, d1, d2, d3 - contain loaded source pixel data - * d4, d5 - contain loaded destination pixels (they are needed) - * d28, d29 - place for storing the result (destination pixels) - */ - -/* - * Step one. We need to have some code to do some arithmetics on pixel data. - * This is implemented as a pair of macros: '*_head' and '*_tail'. When used - * back-to-back, they take pixel data from {d0, d1, d2, d3} and {d4, d5}, - * perform all the needed calculations and write the result to {d28, d29}. - * The rationale for having two macros and not just one will be explained - * later. In practice, any single monolitic function which does the work can - * be split into two parts in any arbitrary way without affecting correctness. - * - * There is one special trick here too. Common template macro can optionally - * make our life a bit easier by doing R, G, B, A color components - * deinterleaving for 32bpp pixel formats (and this feature is used in - * 'pixman_composite_over_8888_0565_asm_neon' function). So it means that - * instead of having 8 packed pixels in {d0, d1, d2, d3} registers, we - * actually use d0 register for blue channel (a vector of eight 8-bit - * values), d1 register for green, d2 for red and d3 for alpha. This - * simple conversion can be also done with a few NEON instructions: - * - * Packed to planar conversion: - * vuzp.8 d0, d1 - * vuzp.8 d2, d3 - * vuzp.8 d1, d3 - * vuzp.8 d0, d2 - * - * Planar to packed conversion: - * vzip.8 d0, d2 - * vzip.8 d1, d3 - * vzip.8 d2, d3 - * vzip.8 d0, d1 - * - * But pixel can be loaded directly in planar format using VLD4.8 NEON - * instruction. It is 1 cycle slower than VLD1.32, so this is not always - * desirable, that's why deinterleaving is optional. - * - * But anyway, here is the code: - */ -.macro pixman_composite_over_8888_0565_process_pixblock_head - /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format - and put data into d6 - red, d7 - green, d30 - blue */ - vshrn.u16 d6, q2, #8 - vshrn.u16 d7, q2, #3 - vsli.u16 q2, q2, #5 - vsri.u8 d6, d6, #5 - vmvn.8 d3, d3 /* invert source alpha */ - vsri.u8 d7, d7, #6 - vshrn.u16 d30, q2, #2 - /* now do alpha blending, storing results in 8-bit planar format - into d16 - red, d19 - green, d18 - blue */ - vmull.u8 q10, d3, d6 - vmull.u8 q11, d3, d7 - vmull.u8 q12, d3, d30 - vrshr.u16 q13, q10, #8 - vrshr.u16 q3, q11, #8 - vrshr.u16 q15, q12, #8 - vraddhn.u16 d20, q10, q13 - vraddhn.u16 d23, q11, q3 - vraddhn.u16 d22, q12, q15 -.endm - -.macro pixman_composite_over_8888_0565_process_pixblock_tail - /* ... 
continue alpha blending */ - vqadd.u8 d16, d2, d20 - vqadd.u8 q9, q0, q11 - /* convert the result to r5g6b5 and store it into {d28, d29} */ - vshll.u8 q14, d16, #8 - vshll.u8 q8, d19, #8 - vshll.u8 q9, d18, #8 - vsri.u16 q14, q8, #5 - vsri.u16 q14, q9, #11 -.endm - -/* - * OK, now we got almost everything that we need. Using the above two - * macros, the work can be done right. But now we want to optimize - * it a bit. ARM Cortex-A8 is an in-order core, and benefits really - * a lot from good code scheduling and software pipelining. - * - * Let's construct some code, which will run in the core main loop. - * Some pseudo-code of the main loop will look like this: - * head - * while (...) { - * tail - * head - * } - * tail - * - * It may look a bit weird, but this setup allows to hide instruction - * latencies better and also utilize dual-issue capability more - * efficiently (make pairs of load-store and ALU instructions). - * - * So what we need now is a '*_tail_head' macro, which will be used - * in the core main loop. A trivial straightforward implementation - * of this macro would look like this: - * - * pixman_composite_over_8888_0565_process_pixblock_tail - * vst1.16 {d28, d29}, [DST_W, :128]! - * vld1.16 {d4, d5}, [DST_R, :128]! - * vld4.32 {d0, d1, d2, d3}, [SRC]! - * pixman_composite_over_8888_0565_process_pixblock_head - * cache_preload 8, 8 - * - * Now it also got some VLD/VST instructions. We simply can't move from - * processing one block of pixels to the other one with just arithmetics. - * The previously processed data needs to be written to memory and new - * data needs to be fetched. Fortunately, this main loop does not deal - * with partial leading/trailing pixels and can load/store a full block - * of pixels in a bulk. Additionally, destination buffer is already - * 16 bytes aligned here (which is good for performance). - * - * New things here are DST_R, DST_W, SRC and MASK identifiers. These - * are the aliases for ARM registers which are used as pointers for - * accessing data. We maintain separate pointers for reading and writing - * destination buffer (DST_R and DST_W). - * - * Another new thing is 'cache_preload' macro. It is used for prefetching - * data into CPU L2 cache and improve performance when dealing with large - * images which are far larger than cache size. It uses one argument - * (actually two, but they need to be the same here) - number of pixels - * in a block. Looking into 'pixman-arm-neon-asm.h' can provide some - * details about this macro. Moreover, if good performance is needed - * the code from this macro needs to be copied into '*_tail_head' macro - * and mixed with the rest of code for optimal instructions scheduling. - * We are actually doing it below. - * - * Now after all the explanations, here is the optimized code. - * Different instruction streams (originaling from '*_head', '*_tail' - * and 'cache_preload' macro) use different indentation levels for - * better readability. Actually taking the code from one of these - * indentation levels and ignoring a few VLD/VST instructions would - * result in exactly the code from '*_head', '*_tail' or 'cache_preload' - * macro! - */ - -#if 1 - -.macro pixman_composite_over_8888_0565_process_pixblock_tail_head - vqadd.u8 d16, d2, d20 - vld1.16 {d4, d5}, [DST_R, :128]! 
- vqadd.u8 q9, q0, q11 - vshrn.u16 d6, q2, #8 - fetch_src_pixblock - vshrn.u16 d7, q2, #3 - vsli.u16 q2, q2, #5 - vshll.u8 q14, d16, #8 - PF add PF_X, PF_X, #8 - vshll.u8 q8, d19, #8 - PF tst PF_CTL, #0xF - vsri.u8 d6, d6, #5 - PF addne PF_X, PF_X, #8 - vmvn.8 d3, d3 - PF subne PF_CTL, PF_CTL, #1 - vsri.u8 d7, d7, #6 - vshrn.u16 d30, q2, #2 - vmull.u8 q10, d3, d6 - PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] - vmull.u8 q11, d3, d7 - vmull.u8 q12, d3, d30 - PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] - vsri.u16 q14, q8, #5 - PF cmp PF_X, ORIG_W - vshll.u8 q9, d18, #8 - vrshr.u16 q13, q10, #8 - PF subge PF_X, PF_X, ORIG_W - vrshr.u16 q3, q11, #8 - vrshr.u16 q15, q12, #8 - PF subges PF_CTL, PF_CTL, #0x10 - vsri.u16 q14, q9, #11 - PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! - vraddhn.u16 d20, q10, q13 - vraddhn.u16 d23, q11, q3 - PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! - vraddhn.u16 d22, q12, q15 - vst1.16 {d28, d29}, [DST_W, :128]! -.endm - -#else - -/* If we did not care much about the performance, we would just use this... */ -.macro pixman_composite_over_8888_0565_process_pixblock_tail_head - pixman_composite_over_8888_0565_process_pixblock_tail - vst1.16 {d28, d29}, [DST_W, :128]! - vld1.16 {d4, d5}, [DST_R, :128]! - fetch_src_pixblock - pixman_composite_over_8888_0565_process_pixblock_head - cache_preload 8, 8 -.endm - -#endif - -/* - * And now the final part. We are using 'generate_composite_function' macro - * to put all the stuff together. We are specifying the name of the function - * which we want to get, number of bits per pixel for the source, mask and - * destination (0 if unused, like mask in this case). Next come some bit - * flags: - * FLAG_DST_READWRITE - tells that the destination buffer is both read - * and written, for write-only buffer we would use - * FLAG_DST_WRITEONLY flag instead - * FLAG_DEINTERLEAVE_32BPP - tells that we prefer to work with planar data - * and separate color channels for 32bpp format. - * The next things are: - * - the number of pixels processed per iteration (8 in this case, because - * that's the maximum what can fit into four 64-bit NEON registers). - * - prefetch distance, measured in pixel blocks. In this case it is 5 times - * by 8 pixels. That would be 40 pixels, or up to 160 bytes. Optimal - * prefetch distance can be selected by running some benchmarks. - * - * After that we specify some macros, these are 'default_init', - * 'default_cleanup' here which are empty (but it is possible to have custom - * init/cleanup macros to be able to save/restore some extra NEON registers - * like d8-d15 or do anything else) followed by - * 'pixman_composite_over_8888_0565_process_pixblock_head', - * 'pixman_composite_over_8888_0565_process_pixblock_tail' and - * 'pixman_composite_over_8888_0565_process_pixblock_tail_head' - * which we got implemented above. - * - * The last part is the NEON registers allocation scheme. 
- */ -generate_composite_function \ - pixman_composite_over_8888_0565_asm_neon, 32, 0, 16, \ - FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - 5, /* prefetch distance */ \ - default_init, \ - default_cleanup, \ - pixman_composite_over_8888_0565_process_pixblock_head, \ - pixman_composite_over_8888_0565_process_pixblock_tail, \ - pixman_composite_over_8888_0565_process_pixblock_tail_head, \ - 28, /* dst_w_basereg */ \ - 4, /* dst_r_basereg */ \ - 0, /* src_basereg */ \ - 24 /* mask_basereg */ - -/******************************************************************************/ - -.macro pixman_composite_over_n_0565_process_pixblock_head - /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format - and put data into d6 - red, d7 - green, d30 - blue */ - vshrn.u16 d6, q2, #8 - vshrn.u16 d7, q2, #3 - vsli.u16 q2, q2, #5 - vsri.u8 d6, d6, #5 - vsri.u8 d7, d7, #6 - vshrn.u16 d30, q2, #2 - /* now do alpha blending, storing results in 8-bit planar format - into d16 - red, d19 - green, d18 - blue */ - vmull.u8 q10, d3, d6 - vmull.u8 q11, d3, d7 - vmull.u8 q12, d3, d30 - vrshr.u16 q13, q10, #8 - vrshr.u16 q3, q11, #8 - vrshr.u16 q15, q12, #8 - vraddhn.u16 d20, q10, q13 - vraddhn.u16 d23, q11, q3 - vraddhn.u16 d22, q12, q15 -.endm - -.macro pixman_composite_over_n_0565_process_pixblock_tail - /* ... continue alpha blending */ - vqadd.u8 d16, d2, d20 - vqadd.u8 q9, q0, q11 - /* convert the result to r5g6b5 and store it into {d28, d29} */ - vshll.u8 q14, d16, #8 - vshll.u8 q8, d19, #8 - vshll.u8 q9, d18, #8 - vsri.u16 q14, q8, #5 - vsri.u16 q14, q9, #11 -.endm - -/* TODO: expand macros and do better instructions scheduling */ -.macro pixman_composite_over_n_0565_process_pixblock_tail_head - pixman_composite_over_n_0565_process_pixblock_tail - vld1.16 {d4, d5}, [DST_R, :128]! - vst1.16 {d28, d29}, [DST_W, :128]! - pixman_composite_over_n_0565_process_pixblock_head - cache_preload 8, 8 -.endm - -.macro pixman_composite_over_n_0565_init - add DUMMY, sp, #ARGS_STACK_OFFSET - vld1.32 {d3[0]}, [DUMMY] - vdup.8 d0, d3[0] - vdup.8 d1, d3[1] - vdup.8 d2, d3[2] - vdup.8 d3, d3[3] - vmvn.8 d3, d3 /* invert source alpha */ -.endm - -generate_composite_function \ - pixman_composite_over_n_0565_asm_neon, 0, 0, 16, \ - FLAG_DST_READWRITE, \ - 8, /* number of pixels, processed in a single block */ \ - 5, /* prefetch distance */ \ - pixman_composite_over_n_0565_init, \ - default_cleanup, \ - pixman_composite_over_n_0565_process_pixblock_head, \ - pixman_composite_over_n_0565_process_pixblock_tail, \ - pixman_composite_over_n_0565_process_pixblock_tail_head, \ - 28, /* dst_w_basereg */ \ - 4, /* dst_r_basereg */ \ - 0, /* src_basereg */ \ - 24 /* mask_basereg */ - -/******************************************************************************/ - -.macro pixman_composite_src_8888_0565_process_pixblock_head - vshll.u8 q8, d1, #8 - vshll.u8 q14, d2, #8 - vshll.u8 q9, d0, #8 -.endm - -.macro pixman_composite_src_8888_0565_process_pixblock_tail - vsri.u16 q14, q8, #5 - vsri.u16 q14, q9, #11 -.endm - -.macro pixman_composite_src_8888_0565_process_pixblock_tail_head - vsri.u16 q14, q8, #5 - PF add PF_X, PF_X, #8 - PF tst PF_CTL, #0xF - fetch_src_pixblock - PF addne PF_X, PF_X, #8 - PF subne PF_CTL, PF_CTL, #1 - vsri.u16 q14, q9, #11 - PF cmp PF_X, ORIG_W - PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] - vshll.u8 q8, d1, #8 - vst1.16 {d28, d29}, [DST_W, :128]! 
- PF subge PF_X, PF_X, ORIG_W - PF subges PF_CTL, PF_CTL, #0x10 - vshll.u8 q14, d2, #8 - PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! - vshll.u8 q9, d0, #8 -.endm - -generate_composite_function \ - pixman_composite_src_8888_0565_asm_neon, 32, 0, 16, \ - FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - 10, /* prefetch distance */ \ - default_init, \ - default_cleanup, \ - pixman_composite_src_8888_0565_process_pixblock_head, \ - pixman_composite_src_8888_0565_process_pixblock_tail, \ - pixman_composite_src_8888_0565_process_pixblock_tail_head - -/******************************************************************************/ - -.macro pixman_composite_src_0565_8888_process_pixblock_head - vshrn.u16 d30, q0, #8 - vshrn.u16 d29, q0, #3 - vsli.u16 q0, q0, #5 - vmov.u8 d31, #255 - vsri.u8 d30, d30, #5 - vsri.u8 d29, d29, #6 - vshrn.u16 d28, q0, #2 -.endm - -.macro pixman_composite_src_0565_8888_process_pixblock_tail -.endm - -/* TODO: expand macros and do better instructions scheduling */ -.macro pixman_composite_src_0565_8888_process_pixblock_tail_head - pixman_composite_src_0565_8888_process_pixblock_tail - vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! - fetch_src_pixblock - pixman_composite_src_0565_8888_process_pixblock_head - cache_preload 8, 8 -.endm - -generate_composite_function \ - pixman_composite_src_0565_8888_asm_neon, 16, 0, 32, \ - FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - 10, /* prefetch distance */ \ - default_init, \ - default_cleanup, \ - pixman_composite_src_0565_8888_process_pixblock_head, \ - pixman_composite_src_0565_8888_process_pixblock_tail, \ - pixman_composite_src_0565_8888_process_pixblock_tail_head - -/******************************************************************************/ - -.macro pixman_composite_add_8_8_process_pixblock_head - vqadd.u8 q14, q0, q2 - vqadd.u8 q15, q1, q3 -.endm - -.macro pixman_composite_add_8_8_process_pixblock_tail -.endm - -.macro pixman_composite_add_8_8_process_pixblock_tail_head - fetch_src_pixblock - PF add PF_X, PF_X, #32 - PF tst PF_CTL, #0xF - vld1.8 {d4, d5, d6, d7}, [DST_R, :128]! - PF addne PF_X, PF_X, #32 - PF subne PF_CTL, PF_CTL, #1 - vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! - PF cmp PF_X, ORIG_W - PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] - PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] - PF subge PF_X, PF_X, ORIG_W - PF subges PF_CTL, PF_CTL, #0x10 - vqadd.u8 q14, q0, q2 - PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! - PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! - vqadd.u8 q15, q1, q3 -.endm - -generate_composite_function \ - pixman_composite_add_8_8_asm_neon, 8, 0, 8, \ - FLAG_DST_READWRITE, \ - 32, /* number of pixels, processed in a single block */ \ - 10, /* prefetch distance */ \ - default_init, \ - default_cleanup, \ - pixman_composite_add_8_8_process_pixblock_head, \ - pixman_composite_add_8_8_process_pixblock_tail, \ - pixman_composite_add_8_8_process_pixblock_tail_head - -/******************************************************************************/ - -.macro pixman_composite_add_8888_8888_process_pixblock_tail_head - fetch_src_pixblock - PF add PF_X, PF_X, #8 - PF tst PF_CTL, #0xF - vld1.32 {d4, d5, d6, d7}, [DST_R, :128]! - PF addne PF_X, PF_X, #8 - PF subne PF_CTL, PF_CTL, #1 - vst1.32 {d28, d29, d30, d31}, [DST_W, :128]! 
- PF cmp PF_X, ORIG_W - PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] - PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] - PF subge PF_X, PF_X, ORIG_W - PF subges PF_CTL, PF_CTL, #0x10 - vqadd.u8 q14, q0, q2 - PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! - PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! - vqadd.u8 q15, q1, q3 -.endm - -generate_composite_function \ - pixman_composite_add_8888_8888_asm_neon, 32, 0, 32, \ - FLAG_DST_READWRITE, \ - 8, /* number of pixels, processed in a single block */ \ - 10, /* prefetch distance */ \ - default_init, \ - default_cleanup, \ - pixman_composite_add_8_8_process_pixblock_head, \ - pixman_composite_add_8_8_process_pixblock_tail, \ - pixman_composite_add_8888_8888_process_pixblock_tail_head - -generate_composite_function_single_scanline \ - pixman_composite_scanline_add_asm_neon, 32, 0, 32, \ - FLAG_DST_READWRITE, \ - 8, /* number of pixels, processed in a single block */ \ - default_init, \ - default_cleanup, \ - pixman_composite_add_8_8_process_pixblock_head, \ - pixman_composite_add_8_8_process_pixblock_tail, \ - pixman_composite_add_8888_8888_process_pixblock_tail_head - -/******************************************************************************/ - -.macro pixman_composite_out_reverse_8888_8888_process_pixblock_head - vmvn.8 d24, d3 /* get inverted alpha */ - /* do alpha blending */ - vmull.u8 q8, d24, d4 - vmull.u8 q9, d24, d5 - vmull.u8 q10, d24, d6 - vmull.u8 q11, d24, d7 -.endm - -.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail - vrshr.u16 q14, q8, #8 - vrshr.u16 q15, q9, #8 - vrshr.u16 q12, q10, #8 - vrshr.u16 q13, q11, #8 - vraddhn.u16 d28, q14, q8 - vraddhn.u16 d29, q15, q9 - vraddhn.u16 d30, q12, q10 - vraddhn.u16 d31, q13, q11 -.endm - -.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head - vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! - vrshr.u16 q14, q8, #8 - PF add PF_X, PF_X, #8 - PF tst PF_CTL, #0xF - vrshr.u16 q15, q9, #8 - vrshr.u16 q12, q10, #8 - vrshr.u16 q13, q11, #8 - PF addne PF_X, PF_X, #8 - PF subne PF_CTL, PF_CTL, #1 - vraddhn.u16 d28, q14, q8 - vraddhn.u16 d29, q15, q9 - PF cmp PF_X, ORIG_W - vraddhn.u16 d30, q12, q10 - vraddhn.u16 d31, q13, q11 - fetch_src_pixblock - PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] - vmvn.8 d22, d3 - PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] - vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! - PF subge PF_X, PF_X, ORIG_W - vmull.u8 q8, d22, d4 - PF subges PF_CTL, PF_CTL, #0x10 - vmull.u8 q9, d22, d5 - PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! - vmull.u8 q10, d22, d6 - PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! 
- vmull.u8 q11, d22, d7 -.endm - -generate_composite_function_single_scanline \ - pixman_composite_scanline_out_reverse_asm_neon, 32, 0, 32, \ - FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - default_init, \ - default_cleanup, \ - pixman_composite_out_reverse_8888_8888_process_pixblock_head, \ - pixman_composite_out_reverse_8888_8888_process_pixblock_tail, \ - pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head - -/******************************************************************************/ - -.macro pixman_composite_over_8888_8888_process_pixblock_head - pixman_composite_out_reverse_8888_8888_process_pixblock_head -.endm - -.macro pixman_composite_over_8888_8888_process_pixblock_tail - pixman_composite_out_reverse_8888_8888_process_pixblock_tail - vqadd.u8 q14, q0, q14 - vqadd.u8 q15, q1, q15 -.endm - -.macro pixman_composite_over_8888_8888_process_pixblock_tail_head - vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! - vrshr.u16 q14, q8, #8 - PF add PF_X, PF_X, #8 - PF tst PF_CTL, #0xF - vrshr.u16 q15, q9, #8 - vrshr.u16 q12, q10, #8 - vrshr.u16 q13, q11, #8 - PF addne PF_X, PF_X, #8 - PF subne PF_CTL, PF_CTL, #1 - vraddhn.u16 d28, q14, q8 - vraddhn.u16 d29, q15, q9 - PF cmp PF_X, ORIG_W - vraddhn.u16 d30, q12, q10 - vraddhn.u16 d31, q13, q11 - vqadd.u8 q14, q0, q14 - vqadd.u8 q15, q1, q15 - fetch_src_pixblock - PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] - vmvn.8 d22, d3 - PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] - vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! - PF subge PF_X, PF_X, ORIG_W - vmull.u8 q8, d22, d4 - PF subges PF_CTL, PF_CTL, #0x10 - vmull.u8 q9, d22, d5 - PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! - vmull.u8 q10, d22, d6 - PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! - vmull.u8 q11, d22, d7 -.endm - -generate_composite_function \ - pixman_composite_over_8888_8888_asm_neon, 32, 0, 32, \ - FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - 5, /* prefetch distance */ \ - default_init, \ - default_cleanup, \ - pixman_composite_over_8888_8888_process_pixblock_head, \ - pixman_composite_over_8888_8888_process_pixblock_tail, \ - pixman_composite_over_8888_8888_process_pixblock_tail_head - -generate_composite_function_single_scanline \ - pixman_composite_scanline_over_asm_neon, 32, 0, 32, \ - FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - default_init, \ - default_cleanup, \ - pixman_composite_over_8888_8888_process_pixblock_head, \ - pixman_composite_over_8888_8888_process_pixblock_tail, \ - pixman_composite_over_8888_8888_process_pixblock_tail_head - -/******************************************************************************/ - -/* TODO: expand macros and do better instructions scheduling */ -.macro pixman_composite_over_n_8888_process_pixblock_tail_head - pixman_composite_over_8888_8888_process_pixblock_tail - vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! - vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! 
- pixman_composite_over_8888_8888_process_pixblock_head - cache_preload 8, 8 -.endm - -.macro pixman_composite_over_n_8888_init - add DUMMY, sp, #ARGS_STACK_OFFSET - vld1.32 {d3[0]}, [DUMMY] - vdup.8 d0, d3[0] - vdup.8 d1, d3[1] - vdup.8 d2, d3[2] - vdup.8 d3, d3[3] -.endm - -generate_composite_function \ - pixman_composite_over_n_8888_asm_neon, 0, 0, 32, \ - FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - 5, /* prefetch distance */ \ - pixman_composite_over_n_8888_init, \ - default_cleanup, \ - pixman_composite_over_8888_8888_process_pixblock_head, \ - pixman_composite_over_8888_8888_process_pixblock_tail, \ - pixman_composite_over_n_8888_process_pixblock_tail_head - -/******************************************************************************/ - -.macro pixman_composite_over_reverse_n_8888_process_pixblock_tail_head - vrshr.u16 q14, q8, #8 - PF add PF_X, PF_X, #8 - PF tst PF_CTL, #0xF - vrshr.u16 q15, q9, #8 - vrshr.u16 q12, q10, #8 - vrshr.u16 q13, q11, #8 - PF addne PF_X, PF_X, #8 - PF subne PF_CTL, PF_CTL, #1 - vraddhn.u16 d28, q14, q8 - vraddhn.u16 d29, q15, q9 - PF cmp PF_X, ORIG_W - vraddhn.u16 d30, q12, q10 - vraddhn.u16 d31, q13, q11 - vqadd.u8 q14, q0, q14 - vqadd.u8 q15, q1, q15 - vld4.8 {d0, d1, d2, d3}, [DST_R, :128]! - vmvn.8 d22, d3 - PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] - vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! - PF subge PF_X, PF_X, ORIG_W - vmull.u8 q8, d22, d4 - PF subges PF_CTL, PF_CTL, #0x10 - vmull.u8 q9, d22, d5 - vmull.u8 q10, d22, d6 - PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! - vmull.u8 q11, d22, d7 -.endm - -.macro pixman_composite_over_reverse_n_8888_init - add DUMMY, sp, #ARGS_STACK_OFFSET - vld1.32 {d7[0]}, [DUMMY] - vdup.8 d4, d7[0] - vdup.8 d5, d7[1] - vdup.8 d6, d7[2] - vdup.8 d7, d7[3] -.endm - -generate_composite_function \ - pixman_composite_over_reverse_n_8888_asm_neon, 0, 0, 32, \ - FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - 5, /* prefetch distance */ \ - pixman_composite_over_reverse_n_8888_init, \ - default_cleanup, \ - pixman_composite_over_8888_8888_process_pixblock_head, \ - pixman_composite_over_8888_8888_process_pixblock_tail, \ - pixman_composite_over_reverse_n_8888_process_pixblock_tail_head, \ - 28, /* dst_w_basereg */ \ - 0, /* dst_r_basereg */ \ - 4, /* src_basereg */ \ - 24 /* mask_basereg */ - -/******************************************************************************/ - -.macro pixman_composite_over_8888_8_0565_process_pixblock_head - vmull.u8 q0, d24, d8 /* IN for SRC pixels (part1) */ - vmull.u8 q1, d24, d9 - vmull.u8 q6, d24, d10 - vmull.u8 q7, d24, d11 - vshrn.u16 d6, q2, #8 /* convert DST_R data to 32-bpp (part1) */ - vshrn.u16 d7, q2, #3 - vsli.u16 q2, q2, #5 - vrshr.u16 q8, q0, #8 /* IN for SRC pixels (part2) */ - vrshr.u16 q9, q1, #8 - vrshr.u16 q10, q6, #8 - vrshr.u16 q11, q7, #8 - vraddhn.u16 d0, q0, q8 - vraddhn.u16 d1, q1, q9 - vraddhn.u16 d2, q6, q10 - vraddhn.u16 d3, q7, q11 - vsri.u8 d6, d6, #5 /* convert DST_R data to 32-bpp (part2) */ - vsri.u8 d7, d7, #6 - vmvn.8 d3, d3 - vshrn.u16 d30, q2, #2 - vmull.u8 q8, d3, d6 /* now do alpha blending */ - vmull.u8 q9, d3, d7 - vmull.u8 q10, d3, d30 -.endm - -.macro pixman_composite_over_8888_8_0565_process_pixblock_tail - /* 3 cycle bubble (after vmull.u8) */ - vrshr.u16 q13, q8, #8 - vrshr.u16 q11, q9, #8 - vrshr.u16 q15, q10, #8 - vraddhn.u16 d16, q8, q13 - vraddhn.u16 d27, q9, q11 - vraddhn.u16 d26, q10, q15 - 
vqadd.u8 d16, d2, d16 - /* 1 cycle bubble */ - vqadd.u8 q9, q0, q13 - vshll.u8 q14, d16, #8 /* convert to 16bpp */ - vshll.u8 q8, d19, #8 - vshll.u8 q9, d18, #8 - vsri.u16 q14, q8, #5 - /* 1 cycle bubble */ - vsri.u16 q14, q9, #11 -.endm - -.macro pixman_composite_over_8888_8_0565_process_pixblock_tail_head - vld1.16 {d4, d5}, [DST_R, :128]! - vshrn.u16 d6, q2, #8 - fetch_mask_pixblock - vshrn.u16 d7, q2, #3 - fetch_src_pixblock - vmull.u8 q6, d24, d10 - vrshr.u16 q13, q8, #8 - vrshr.u16 q11, q9, #8 - vrshr.u16 q15, q10, #8 - vraddhn.u16 d16, q8, q13 - vraddhn.u16 d27, q9, q11 - vraddhn.u16 d26, q10, q15 - vqadd.u8 d16, d2, d16 - vmull.u8 q1, d24, d9 - vqadd.u8 q9, q0, q13 - vshll.u8 q14, d16, #8 - vmull.u8 q0, d24, d8 - vshll.u8 q8, d19, #8 - vshll.u8 q9, d18, #8 - vsri.u16 q14, q8, #5 - vmull.u8 q7, d24, d11 - vsri.u16 q14, q9, #11 - - cache_preload 8, 8 - - vsli.u16 q2, q2, #5 - vrshr.u16 q8, q0, #8 - vrshr.u16 q9, q1, #8 - vrshr.u16 q10, q6, #8 - vrshr.u16 q11, q7, #8 - vraddhn.u16 d0, q0, q8 - vraddhn.u16 d1, q1, q9 - vraddhn.u16 d2, q6, q10 - vraddhn.u16 d3, q7, q11 - vsri.u8 d6, d6, #5 - vsri.u8 d7, d7, #6 - vmvn.8 d3, d3 - vshrn.u16 d30, q2, #2 - vst1.16 {d28, d29}, [DST_W, :128]! - vmull.u8 q8, d3, d6 - vmull.u8 q9, d3, d7 - vmull.u8 q10, d3, d30 -.endm - -generate_composite_function \ - pixman_composite_over_8888_8_0565_asm_neon, 32, 8, 16, \ - FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - 5, /* prefetch distance */ \ - default_init_need_all_regs, \ - default_cleanup_need_all_regs, \ - pixman_composite_over_8888_8_0565_process_pixblock_head, \ - pixman_composite_over_8888_8_0565_process_pixblock_tail, \ - pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \ - 28, /* dst_w_basereg */ \ - 4, /* dst_r_basereg */ \ - 8, /* src_basereg */ \ - 24 /* mask_basereg */ - -/******************************************************************************/ - -/* - * This function needs a special initialization of solid mask. - * Solid source pixel data is fetched from stack at ARGS_STACK_OFFSET - * offset, split into color components and replicated in d8-d11 - * registers. Additionally, this function needs all the NEON registers, - * so it has to save d8-d15 registers which are callee saved according - * to ABI. These registers are restored from 'cleanup' macro. All the - * other NEON registers are caller saved, so can be clobbered freely - * without introducing any problems. 
- */ -.macro pixman_composite_over_n_8_0565_init - add DUMMY, sp, #ARGS_STACK_OFFSET - vpush {d8-d15} - vld1.32 {d11[0]}, [DUMMY] - vdup.8 d8, d11[0] - vdup.8 d9, d11[1] - vdup.8 d10, d11[2] - vdup.8 d11, d11[3] -.endm - -.macro pixman_composite_over_n_8_0565_cleanup - vpop {d8-d15} -.endm - -generate_composite_function \ - pixman_composite_over_n_8_0565_asm_neon, 0, 8, 16, \ - FLAG_DST_READWRITE, \ - 8, /* number of pixels, processed in a single block */ \ - 5, /* prefetch distance */ \ - pixman_composite_over_n_8_0565_init, \ - pixman_composite_over_n_8_0565_cleanup, \ - pixman_composite_over_8888_8_0565_process_pixblock_head, \ - pixman_composite_over_8888_8_0565_process_pixblock_tail, \ - pixman_composite_over_8888_8_0565_process_pixblock_tail_head - -/******************************************************************************/ - -.macro pixman_composite_over_8888_n_0565_init - add DUMMY, sp, #(ARGS_STACK_OFFSET + 8) - vpush {d8-d15} - vld1.32 {d24[0]}, [DUMMY] - vdup.8 d24, d24[3] -.endm - -.macro pixman_composite_over_8888_n_0565_cleanup - vpop {d8-d15} -.endm - -generate_composite_function \ - pixman_composite_over_8888_n_0565_asm_neon, 32, 0, 16, \ - FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - 5, /* prefetch distance */ \ - pixman_composite_over_8888_n_0565_init, \ - pixman_composite_over_8888_n_0565_cleanup, \ - pixman_composite_over_8888_8_0565_process_pixblock_head, \ - pixman_composite_over_8888_8_0565_process_pixblock_tail, \ - pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \ - 28, /* dst_w_basereg */ \ - 4, /* dst_r_basereg */ \ - 8, /* src_basereg */ \ - 24 /* mask_basereg */ - -/******************************************************************************/ - -.macro pixman_composite_src_0565_0565_process_pixblock_head -.endm - -.macro pixman_composite_src_0565_0565_process_pixblock_tail -.endm - -.macro pixman_composite_src_0565_0565_process_pixblock_tail_head - vst1.16 {d0, d1, d2, d3}, [DST_W, :128]! - fetch_src_pixblock - cache_preload 16, 16 -.endm - -generate_composite_function \ - pixman_composite_src_0565_0565_asm_neon, 16, 0, 16, \ - FLAG_DST_WRITEONLY, \ - 16, /* number of pixels, processed in a single block */ \ - 10, /* prefetch distance */ \ - default_init, \ - default_cleanup, \ - pixman_composite_src_0565_0565_process_pixblock_head, \ - pixman_composite_src_0565_0565_process_pixblock_tail, \ - pixman_composite_src_0565_0565_process_pixblock_tail_head, \ - 0, /* dst_w_basereg */ \ - 0, /* dst_r_basereg */ \ - 0, /* src_basereg */ \ - 0 /* mask_basereg */ - -/******************************************************************************/ - -.macro pixman_composite_src_n_8_process_pixblock_head -.endm - -.macro pixman_composite_src_n_8_process_pixblock_tail -.endm - -.macro pixman_composite_src_n_8_process_pixblock_tail_head - vst1.8 {d0, d1, d2, d3}, [DST_W, :128]! 
-.endm - -.macro pixman_composite_src_n_8_init - add DUMMY, sp, #ARGS_STACK_OFFSET - vld1.32 {d0[0]}, [DUMMY] - vsli.u64 d0, d0, #8 - vsli.u64 d0, d0, #16 - vsli.u64 d0, d0, #32 - vorr d1, d0, d0 - vorr q1, q0, q0 -.endm - -.macro pixman_composite_src_n_8_cleanup -.endm - -generate_composite_function \ - pixman_composite_src_n_8_asm_neon, 0, 0, 8, \ - FLAG_DST_WRITEONLY, \ - 32, /* number of pixels, processed in a single block */ \ - 0, /* prefetch distance */ \ - pixman_composite_src_n_8_init, \ - pixman_composite_src_n_8_cleanup, \ - pixman_composite_src_n_8_process_pixblock_head, \ - pixman_composite_src_n_8_process_pixblock_tail, \ - pixman_composite_src_n_8_process_pixblock_tail_head, \ - 0, /* dst_w_basereg */ \ - 0, /* dst_r_basereg */ \ - 0, /* src_basereg */ \ - 0 /* mask_basereg */ - -/******************************************************************************/ - -.macro pixman_composite_src_n_0565_process_pixblock_head -.endm - -.macro pixman_composite_src_n_0565_process_pixblock_tail -.endm - -.macro pixman_composite_src_n_0565_process_pixblock_tail_head - vst1.16 {d0, d1, d2, d3}, [DST_W, :128]! -.endm - -.macro pixman_composite_src_n_0565_init - add DUMMY, sp, #ARGS_STACK_OFFSET - vld1.32 {d0[0]}, [DUMMY] - vsli.u64 d0, d0, #16 - vsli.u64 d0, d0, #32 - vorr d1, d0, d0 - vorr q1, q0, q0 -.endm - -.macro pixman_composite_src_n_0565_cleanup -.endm - -generate_composite_function \ - pixman_composite_src_n_0565_asm_neon, 0, 0, 16, \ - FLAG_DST_WRITEONLY, \ - 16, /* number of pixels, processed in a single block */ \ - 0, /* prefetch distance */ \ - pixman_composite_src_n_0565_init, \ - pixman_composite_src_n_0565_cleanup, \ - pixman_composite_src_n_0565_process_pixblock_head, \ - pixman_composite_src_n_0565_process_pixblock_tail, \ - pixman_composite_src_n_0565_process_pixblock_tail_head, \ - 0, /* dst_w_basereg */ \ - 0, /* dst_r_basereg */ \ - 0, /* src_basereg */ \ - 0 /* mask_basereg */ - -/******************************************************************************/ - -.macro pixman_composite_src_n_8888_process_pixblock_head -.endm - -.macro pixman_composite_src_n_8888_process_pixblock_tail -.endm - -.macro pixman_composite_src_n_8888_process_pixblock_tail_head - vst1.32 {d0, d1, d2, d3}, [DST_W, :128]! -.endm - -.macro pixman_composite_src_n_8888_init - add DUMMY, sp, #ARGS_STACK_OFFSET - vld1.32 {d0[0]}, [DUMMY] - vsli.u64 d0, d0, #32 - vorr d1, d0, d0 - vorr q1, q0, q0 -.endm - -.macro pixman_composite_src_n_8888_cleanup -.endm - -generate_composite_function \ - pixman_composite_src_n_8888_asm_neon, 0, 0, 32, \ - FLAG_DST_WRITEONLY, \ - 8, /* number of pixels, processed in a single block */ \ - 0, /* prefetch distance */ \ - pixman_composite_src_n_8888_init, \ - pixman_composite_src_n_8888_cleanup, \ - pixman_composite_src_n_8888_process_pixblock_head, \ - pixman_composite_src_n_8888_process_pixblock_tail, \ - pixman_composite_src_n_8888_process_pixblock_tail_head, \ - 0, /* dst_w_basereg */ \ - 0, /* dst_r_basereg */ \ - 0, /* src_basereg */ \ - 0 /* mask_basereg */ - -/******************************************************************************/ - -.macro pixman_composite_src_8888_8888_process_pixblock_head -.endm - -.macro pixman_composite_src_8888_8888_process_pixblock_tail -.endm - -.macro pixman_composite_src_8888_8888_process_pixblock_tail_head - vst1.32 {d0, d1, d2, d3}, [DST_W, :128]! 
- fetch_src_pixblock - cache_preload 8, 8 -.endm - -generate_composite_function \ - pixman_composite_src_8888_8888_asm_neon, 32, 0, 32, \ - FLAG_DST_WRITEONLY, \ - 8, /* number of pixels, processed in a single block */ \ - 10, /* prefetch distance */ \ - default_init, \ - default_cleanup, \ - pixman_composite_src_8888_8888_process_pixblock_head, \ - pixman_composite_src_8888_8888_process_pixblock_tail, \ - pixman_composite_src_8888_8888_process_pixblock_tail_head, \ - 0, /* dst_w_basereg */ \ - 0, /* dst_r_basereg */ \ - 0, /* src_basereg */ \ - 0 /* mask_basereg */ - -/******************************************************************************/ - -.macro pixman_composite_src_x888_8888_process_pixblock_head - vorr q0, q0, q2 - vorr q1, q1, q2 -.endm - -.macro pixman_composite_src_x888_8888_process_pixblock_tail -.endm - -.macro pixman_composite_src_x888_8888_process_pixblock_tail_head - vst1.32 {d0, d1, d2, d3}, [DST_W, :128]! - fetch_src_pixblock - vorr q0, q0, q2 - vorr q1, q1, q2 - cache_preload 8, 8 -.endm - -.macro pixman_composite_src_x888_8888_init - vmov.u8 q2, #0xFF - vshl.u32 q2, q2, #24 -.endm - -generate_composite_function \ - pixman_composite_src_x888_8888_asm_neon, 32, 0, 32, \ - FLAG_DST_WRITEONLY, \ - 8, /* number of pixels, processed in a single block */ \ - 10, /* prefetch distance */ \ - pixman_composite_src_x888_8888_init, \ - default_cleanup, \ - pixman_composite_src_x888_8888_process_pixblock_head, \ - pixman_composite_src_x888_8888_process_pixblock_tail, \ - pixman_composite_src_x888_8888_process_pixblock_tail_head, \ - 0, /* dst_w_basereg */ \ - 0, /* dst_r_basereg */ \ - 0, /* src_basereg */ \ - 0 /* mask_basereg */ - -/******************************************************************************/ - -.macro pixman_composite_over_n_8_8888_process_pixblock_head - /* expecting deinterleaved source data in {d8, d9, d10, d11} */ - /* d8 - blue, d9 - green, d10 - red, d11 - alpha */ - /* and destination data in {d4, d5, d6, d7} */ - /* mask is in d24 (d25, d26, d27 are unused) */ - - /* in */ - vmull.u8 q0, d24, d8 - vmull.u8 q1, d24, d9 - vmull.u8 q6, d24, d10 - vmull.u8 q7, d24, d11 - vrshr.u16 q10, q0, #8 - vrshr.u16 q11, q1, #8 - vrshr.u16 q12, q6, #8 - vrshr.u16 q13, q7, #8 - vraddhn.u16 d0, q0, q10 - vraddhn.u16 d1, q1, q11 - vraddhn.u16 d2, q6, q12 - vraddhn.u16 d3, q7, q13 - vmvn.8 d24, d3 /* get inverted alpha */ - /* source: d0 - blue, d1 - green, d2 - red, d3 - alpha */ - /* destination: d4 - blue, d5 - green, d6 - red, d7 - alpha */ - /* now do alpha blending */ - vmull.u8 q8, d24, d4 - vmull.u8 q9, d24, d5 - vmull.u8 q10, d24, d6 - vmull.u8 q11, d24, d7 -.endm - -.macro pixman_composite_over_n_8_8888_process_pixblock_tail - vrshr.u16 q14, q8, #8 - vrshr.u16 q15, q9, #8 - vrshr.u16 q12, q10, #8 - vrshr.u16 q13, q11, #8 - vraddhn.u16 d28, q14, q8 - vraddhn.u16 d29, q15, q9 - vraddhn.u16 d30, q12, q10 - vraddhn.u16 d31, q13, q11 - vqadd.u8 q14, q0, q14 - vqadd.u8 q15, q1, q15 -.endm - -/* TODO: expand macros and do better instructions scheduling */ -.macro pixman_composite_over_n_8_8888_process_pixblock_tail_head - pixman_composite_over_n_8_8888_process_pixblock_tail - vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! - vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! 
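For reference, here is a rough scalar model of what the over_n_8_8888 head/tail pair above computes per pixel: the solid premultiplied source is first scaled by the 8-bit mask ("in"), then composited OVER the destination with a saturating add. The div255 helper mirrors the rounding behaviour of the vmull.u8 / vrshr.u16 #8 / vraddhn.u16 sequences; the packed-word representation and the function names are illustrative, not pixman API.

#include <stdint.h>

/* Round t/255 to nearest, as the NEON multiply/round/narrow sequences do. */
static uint8_t div255(uint16_t t)
{
    return (uint8_t)((t + 128 + ((t + 128) >> 8)) >> 8);
}

/* One pixel of over_n_8_8888 in scalar C (sketch only). */
static uint32_t over_n_8_8888_pixel(uint32_t src, uint8_t mask, uint32_t dst)
{
    uint32_t result = 0;
    uint8_t sa = div255((uint16_t)((src >> 24) & 0xff) * mask); /* masked source alpha */
    uint8_t inv_sa = (uint8_t)(255 - sa);                       /* vmvn.8 d24, d3 */

    for (int shift = 0; shift < 32; shift += 8) {
        uint8_t s = div255((uint16_t)((src >> shift) & 0xff) * mask); /* "in" */
        uint8_t d = (uint8_t)((dst >> shift) & 0xff);
        uint16_t sum = s + div255((uint16_t)(d * inv_sa));            /* alpha blend */
        if (sum > 255)
            sum = 255;                                                /* vqadd.u8 saturation */
        result |= (uint32_t)sum << shift;
    }
    return result;
}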
- fetch_mask_pixblock - cache_preload 8, 8 - pixman_composite_over_n_8_8888_process_pixblock_head -.endm - -.macro pixman_composite_over_n_8_8888_init - add DUMMY, sp, #ARGS_STACK_OFFSET - vpush {d8-d15} - vld1.32 {d11[0]}, [DUMMY] - vdup.8 d8, d11[0] - vdup.8 d9, d11[1] - vdup.8 d10, d11[2] - vdup.8 d11, d11[3] -.endm - -.macro pixman_composite_over_n_8_8888_cleanup - vpop {d8-d15} -.endm - -generate_composite_function \ - pixman_composite_over_n_8_8888_asm_neon, 0, 8, 32, \ - FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - 5, /* prefetch distance */ \ - pixman_composite_over_n_8_8888_init, \ - pixman_composite_over_n_8_8888_cleanup, \ - pixman_composite_over_n_8_8888_process_pixblock_head, \ - pixman_composite_over_n_8_8888_process_pixblock_tail, \ - pixman_composite_over_n_8_8888_process_pixblock_tail_head - -/******************************************************************************/ - -.macro pixman_composite_over_n_8_8_process_pixblock_head - vmull.u8 q0, d24, d8 - vmull.u8 q1, d25, d8 - vmull.u8 q6, d26, d8 - vmull.u8 q7, d27, d8 - vrshr.u16 q10, q0, #8 - vrshr.u16 q11, q1, #8 - vrshr.u16 q12, q6, #8 - vrshr.u16 q13, q7, #8 - vraddhn.u16 d0, q0, q10 - vraddhn.u16 d1, q1, q11 - vraddhn.u16 d2, q6, q12 - vraddhn.u16 d3, q7, q13 - vmvn.8 q12, q0 - vmvn.8 q13, q1 - vmull.u8 q8, d24, d4 - vmull.u8 q9, d25, d5 - vmull.u8 q10, d26, d6 - vmull.u8 q11, d27, d7 -.endm - -.macro pixman_composite_over_n_8_8_process_pixblock_tail - vrshr.u16 q14, q8, #8 - vrshr.u16 q15, q9, #8 - vrshr.u16 q12, q10, #8 - vrshr.u16 q13, q11, #8 - vraddhn.u16 d28, q14, q8 - vraddhn.u16 d29, q15, q9 - vraddhn.u16 d30, q12, q10 - vraddhn.u16 d31, q13, q11 - vqadd.u8 q14, q0, q14 - vqadd.u8 q15, q1, q15 -.endm - -/* TODO: expand macros and do better instructions scheduling */ -.macro pixman_composite_over_n_8_8_process_pixblock_tail_head - vld1.8 {d4, d5, d6, d7}, [DST_R, :128]! - pixman_composite_over_n_8_8_process_pixblock_tail - fetch_mask_pixblock - cache_preload 32, 32 - vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! 
- pixman_composite_over_n_8_8_process_pixblock_head -.endm - -.macro pixman_composite_over_n_8_8_init - add DUMMY, sp, #ARGS_STACK_OFFSET - vpush {d8-d15} - vld1.32 {d8[0]}, [DUMMY] - vdup.8 d8, d8[3] -.endm - -.macro pixman_composite_over_n_8_8_cleanup - vpop {d8-d15} -.endm - -generate_composite_function \ - pixman_composite_over_n_8_8_asm_neon, 0, 8, 8, \ - FLAG_DST_READWRITE, \ - 32, /* number of pixels, processed in a single block */ \ - 5, /* prefetch distance */ \ - pixman_composite_over_n_8_8_init, \ - pixman_composite_over_n_8_8_cleanup, \ - pixman_composite_over_n_8_8_process_pixblock_head, \ - pixman_composite_over_n_8_8_process_pixblock_tail, \ - pixman_composite_over_n_8_8_process_pixblock_tail_head - -/******************************************************************************/ - -.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_head - /* - * 'combine_mask_ca' replacement - * - * input: solid src (n) in {d8, d9, d10, d11} - * dest in {d4, d5, d6, d7 } - * mask in {d24, d25, d26, d27} - * output: updated src in {d0, d1, d2, d3 } - * updated mask in {d24, d25, d26, d3 } - */ - vmull.u8 q0, d24, d8 - vmull.u8 q1, d25, d9 - vmull.u8 q6, d26, d10 - vmull.u8 q7, d27, d11 - vmull.u8 q9, d11, d25 - vmull.u8 q12, d11, d24 - vmull.u8 q13, d11, d26 - vrshr.u16 q8, q0, #8 - vrshr.u16 q10, q1, #8 - vrshr.u16 q11, q6, #8 - vraddhn.u16 d0, q0, q8 - vraddhn.u16 d1, q1, q10 - vraddhn.u16 d2, q6, q11 - vrshr.u16 q11, q12, #8 - vrshr.u16 q8, q9, #8 - vrshr.u16 q6, q13, #8 - vrshr.u16 q10, q7, #8 - vraddhn.u16 d24, q12, q11 - vraddhn.u16 d25, q9, q8 - vraddhn.u16 d26, q13, q6 - vraddhn.u16 d3, q7, q10 - /* - * 'combine_over_ca' replacement - * - * output: updated dest in {d28, d29, d30, d31} - */ - vmvn.8 d24, d24 - vmvn.8 d25, d25 - vmull.u8 q8, d24, d4 - vmull.u8 q9, d25, d5 - vmvn.8 d26, d26 - vmvn.8 d27, d3 - vmull.u8 q10, d26, d6 - vmull.u8 q11, d27, d7 -.endm - -.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail - /* ... continue 'combine_over_ca' replacement */ - vrshr.u16 q14, q8, #8 - vrshr.u16 q15, q9, #8 - vrshr.u16 q6, q10, #8 - vrshr.u16 q7, q11, #8 - vraddhn.u16 d28, q14, q8 - vraddhn.u16 d29, q15, q9 - vraddhn.u16 d30, q6, q10 - vraddhn.u16 d31, q7, q11 - vqadd.u8 q14, q0, q14 - vqadd.u8 q15, q1, q15 -.endm - -.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head - vrshr.u16 q14, q8, #8 - vrshr.u16 q15, q9, #8 - vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! - vrshr.u16 q6, q10, #8 - vrshr.u16 q7, q11, #8 - vraddhn.u16 d28, q14, q8 - vraddhn.u16 d29, q15, q9 - vraddhn.u16 d30, q6, q10 - vraddhn.u16 d31, q7, q11 - fetch_mask_pixblock - vqadd.u8 q14, q0, q14 - vqadd.u8 q15, q1, q15 - cache_preload 8, 8 - pixman_composite_over_n_8888_8888_ca_process_pixblock_head - vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! 
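The 'combine_mask_ca' / 'combine_over_ca' replacement above works channel by channel. A minimal scalar sketch of one channel, assuming the same rounded division by 255 as elsewhere in this file; names are illustrative only:

#include <stdint.h>

static uint8_t div255(uint16_t t)        /* rounded t/255 */
{
    return (uint8_t)((t + 128 + ((t + 128) >> 8)) >> 8);
}

/* One colour channel of component-alpha OVER (sketch):
 * combine_mask_ca : s' = s * m / 255,  m' = sa * m / 255
 * combine_over_ca : d  = s' + d * (255 - m') / 255, saturating.
 * 's' is one channel of the solid source, 'sa' its alpha and
 * 'm' the matching channel of the a8r8g8b8 mask. */
static uint8_t over_ca_channel(uint8_t s, uint8_t sa, uint8_t m, uint8_t d)
{
    uint8_t s_in = div255((uint16_t)s * m);
    uint8_t m_in = div255((uint16_t)sa * m);
    uint16_t sum = s_in + div255((uint16_t)(d * (255 - m_in)));
    return sum > 255 ? 255 : (uint8_t)sum;
}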
-.endm - -.macro pixman_composite_over_n_8888_8888_ca_init - add DUMMY, sp, #ARGS_STACK_OFFSET - vpush {d8-d15} - vld1.32 {d11[0]}, [DUMMY] - vdup.8 d8, d11[0] - vdup.8 d9, d11[1] - vdup.8 d10, d11[2] - vdup.8 d11, d11[3] -.endm - -.macro pixman_composite_over_n_8888_8888_ca_cleanup - vpop {d8-d15} -.endm - -generate_composite_function \ - pixman_composite_over_n_8888_8888_ca_asm_neon, 0, 32, 32, \ - FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - 5, /* prefetch distance */ \ - pixman_composite_over_n_8888_8888_ca_init, \ - pixman_composite_over_n_8888_8888_ca_cleanup, \ - pixman_composite_over_n_8888_8888_ca_process_pixblock_head, \ - pixman_composite_over_n_8888_8888_ca_process_pixblock_tail, \ - pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head - -/******************************************************************************/ - -.macro pixman_composite_in_n_8_process_pixblock_head - /* expecting source data in {d0, d1, d2, d3} */ - /* and destination data in {d4, d5, d6, d7} */ - vmull.u8 q8, d4, d3 - vmull.u8 q9, d5, d3 - vmull.u8 q10, d6, d3 - vmull.u8 q11, d7, d3 -.endm - -.macro pixman_composite_in_n_8_process_pixblock_tail - vrshr.u16 q14, q8, #8 - vrshr.u16 q15, q9, #8 - vrshr.u16 q12, q10, #8 - vrshr.u16 q13, q11, #8 - vraddhn.u16 d28, q8, q14 - vraddhn.u16 d29, q9, q15 - vraddhn.u16 d30, q10, q12 - vraddhn.u16 d31, q11, q13 -.endm - -.macro pixman_composite_in_n_8_process_pixblock_tail_head - pixman_composite_in_n_8_process_pixblock_tail - vld1.8 {d4, d5, d6, d7}, [DST_R, :128]! - cache_preload 32, 32 - pixman_composite_in_n_8_process_pixblock_head - vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! -.endm - -.macro pixman_composite_in_n_8_init - add DUMMY, sp, #ARGS_STACK_OFFSET - vld1.32 {d3[0]}, [DUMMY] - vdup.8 d3, d3[3] -.endm - -.macro pixman_composite_in_n_8_cleanup -.endm - -generate_composite_function \ - pixman_composite_in_n_8_asm_neon, 0, 0, 8, \ - FLAG_DST_READWRITE, \ - 32, /* number of pixels, processed in a single block */ \ - 5, /* prefetch distance */ \ - pixman_composite_in_n_8_init, \ - pixman_composite_in_n_8_cleanup, \ - pixman_composite_in_n_8_process_pixblock_head, \ - pixman_composite_in_n_8_process_pixblock_tail, \ - pixman_composite_in_n_8_process_pixblock_tail_head, \ - 28, /* dst_w_basereg */ \ - 4, /* dst_r_basereg */ \ - 0, /* src_basereg */ \ - 24 /* mask_basereg */ - -.macro pixman_composite_add_n_8_8_process_pixblock_head - /* expecting source data in {d8, d9, d10, d11} */ - /* d8 - blue, d9 - green, d10 - red, d11 - alpha */ - /* and destination data in {d4, d5, d6, d7} */ - /* mask is in d24, d25, d26, d27 */ - vmull.u8 q0, d24, d11 - vmull.u8 q1, d25, d11 - vmull.u8 q6, d26, d11 - vmull.u8 q7, d27, d11 - vrshr.u16 q10, q0, #8 - vrshr.u16 q11, q1, #8 - vrshr.u16 q12, q6, #8 - vrshr.u16 q13, q7, #8 - vraddhn.u16 d0, q0, q10 - vraddhn.u16 d1, q1, q11 - vraddhn.u16 d2, q6, q12 - vraddhn.u16 d3, q7, q13 - vqadd.u8 q14, q0, q2 - vqadd.u8 q15, q1, q3 -.endm - -.macro pixman_composite_add_n_8_8_process_pixblock_tail -.endm - -/* TODO: expand macros and do better instructions scheduling */ -.macro pixman_composite_add_n_8_8_process_pixblock_tail_head - pixman_composite_add_n_8_8_process_pixblock_tail - vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! - vld1.8 {d4, d5, d6, d7}, [DST_R, :128]! 
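The two solid-source operations above reduce, per byte, to a multiply by the solid alpha and, for the add case, a saturating add into the destination. A small scalar sketch with illustrative names and the same rounded /255 helper:

#include <stdint.h>

static uint8_t div255(uint16_t t)        /* rounded t/255 */
{
    return (uint8_t)((t + 128 + ((t + 128) >> 8)) >> 8);
}

/* in_n_8: every destination byte is scaled by the solid source alpha. */
static uint8_t in_n_8_pixel(uint8_t dst, uint8_t src_alpha)
{
    return div255((uint16_t)dst * src_alpha);
}

/* add_n_8_8: the mask byte is scaled by the solid source alpha, then
 * added to the destination with saturation (the vqadd.u8 above). */
static uint8_t add_n_8_8_pixel(uint8_t dst, uint8_t mask, uint8_t src_alpha)
{
    uint16_t sum = dst + div255((uint16_t)mask * src_alpha);
    return sum > 255 ? 255 : (uint8_t)sum;
}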
- fetch_mask_pixblock - cache_preload 32, 32 - pixman_composite_add_n_8_8_process_pixblock_head -.endm - -.macro pixman_composite_add_n_8_8_init - add DUMMY, sp, #ARGS_STACK_OFFSET - vpush {d8-d15} - vld1.32 {d11[0]}, [DUMMY] - vdup.8 d11, d11[3] -.endm - -.macro pixman_composite_add_n_8_8_cleanup - vpop {d8-d15} -.endm - -generate_composite_function \ - pixman_composite_add_n_8_8_asm_neon, 0, 8, 8, \ - FLAG_DST_READWRITE, \ - 32, /* number of pixels, processed in a single block */ \ - 5, /* prefetch distance */ \ - pixman_composite_add_n_8_8_init, \ - pixman_composite_add_n_8_8_cleanup, \ - pixman_composite_add_n_8_8_process_pixblock_head, \ - pixman_composite_add_n_8_8_process_pixblock_tail, \ - pixman_composite_add_n_8_8_process_pixblock_tail_head - -/******************************************************************************/ - -.macro pixman_composite_add_8_8_8_process_pixblock_head - /* expecting source data in {d0, d1, d2, d3} */ - /* destination data in {d4, d5, d6, d7} */ - /* mask in {d24, d25, d26, d27} */ - vmull.u8 q8, d24, d0 - vmull.u8 q9, d25, d1 - vmull.u8 q10, d26, d2 - vmull.u8 q11, d27, d3 - vrshr.u16 q0, q8, #8 - vrshr.u16 q1, q9, #8 - vrshr.u16 q12, q10, #8 - vrshr.u16 q13, q11, #8 - vraddhn.u16 d0, q0, q8 - vraddhn.u16 d1, q1, q9 - vraddhn.u16 d2, q12, q10 - vraddhn.u16 d3, q13, q11 - vqadd.u8 q14, q0, q2 - vqadd.u8 q15, q1, q3 -.endm - -.macro pixman_composite_add_8_8_8_process_pixblock_tail -.endm - -/* TODO: expand macros and do better instructions scheduling */ -.macro pixman_composite_add_8_8_8_process_pixblock_tail_head - pixman_composite_add_8_8_8_process_pixblock_tail - vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! - vld1.8 {d4, d5, d6, d7}, [DST_R, :128]! - fetch_mask_pixblock - fetch_src_pixblock - cache_preload 32, 32 - pixman_composite_add_8_8_8_process_pixblock_head -.endm - -.macro pixman_composite_add_8_8_8_init -.endm - -.macro pixman_composite_add_8_8_8_cleanup -.endm - -generate_composite_function \ - pixman_composite_add_8_8_8_asm_neon, 8, 8, 8, \ - FLAG_DST_READWRITE, \ - 32, /* number of pixels, processed in a single block */ \ - 5, /* prefetch distance */ \ - pixman_composite_add_8_8_8_init, \ - pixman_composite_add_8_8_8_cleanup, \ - pixman_composite_add_8_8_8_process_pixblock_head, \ - pixman_composite_add_8_8_8_process_pixblock_tail, \ - pixman_composite_add_8_8_8_process_pixblock_tail_head - -/******************************************************************************/ - -.macro pixman_composite_add_8888_8888_8888_process_pixblock_head - /* expecting source data in {d0, d1, d2, d3} */ - /* destination data in {d4, d5, d6, d7} */ - /* mask in {d24, d25, d26, d27} */ - vmull.u8 q8, d27, d0 - vmull.u8 q9, d27, d1 - vmull.u8 q10, d27, d2 - vmull.u8 q11, d27, d3 - /* 1 cycle bubble */ - vrsra.u16 q8, q8, #8 - vrsra.u16 q9, q9, #8 - vrsra.u16 q10, q10, #8 - vrsra.u16 q11, q11, #8 -.endm - -.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail - /* 2 cycle bubble */ - vrshrn.u16 d28, q8, #8 - vrshrn.u16 d29, q9, #8 - vrshrn.u16 d30, q10, #8 - vrshrn.u16 d31, q11, #8 - vqadd.u8 q14, q2, q14 - /* 1 cycle bubble */ - vqadd.u8 q15, q3, q15 -.endm - -.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail_head - fetch_src_pixblock - vrshrn.u16 d28, q8, #8 - fetch_mask_pixblock - vrshrn.u16 d29, q9, #8 - vmull.u8 q8, d27, d0 - vrshrn.u16 d30, q10, #8 - vmull.u8 q9, d27, d1 - vrshrn.u16 d31, q11, #8 - vmull.u8 q10, d27, d2 - vqadd.u8 q14, q2, q14 - vmull.u8 q11, d27, d3 - vqadd.u8 q15, q3, q15 - vrsra.u16 q8, q8, #8 - vld4.8 {d4, 
d5, d6, d7}, [DST_R, :128]! - vrsra.u16 q9, q9, #8 - vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! - vrsra.u16 q10, q10, #8 - - cache_preload 8, 8 - - vrsra.u16 q11, q11, #8 -.endm - -generate_composite_function \ - pixman_composite_add_8888_8888_8888_asm_neon, 32, 32, 32, \ - FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - 10, /* prefetch distance */ \ - default_init, \ - default_cleanup, \ - pixman_composite_add_8888_8888_8888_process_pixblock_head, \ - pixman_composite_add_8888_8888_8888_process_pixblock_tail, \ - pixman_composite_add_8888_8888_8888_process_pixblock_tail_head - -generate_composite_function_single_scanline \ - pixman_composite_scanline_add_mask_asm_neon, 32, 32, 32, \ - FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - default_init, \ - default_cleanup, \ - pixman_composite_add_8888_8888_8888_process_pixblock_head, \ - pixman_composite_add_8888_8888_8888_process_pixblock_tail, \ - pixman_composite_add_8888_8888_8888_process_pixblock_tail_head - -/******************************************************************************/ - -generate_composite_function \ - pixman_composite_add_8888_8_8888_asm_neon, 32, 8, 32, \ - FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - 5, /* prefetch distance */ \ - default_init, \ - default_cleanup, \ - pixman_composite_add_8888_8888_8888_process_pixblock_head, \ - pixman_composite_add_8888_8888_8888_process_pixblock_tail, \ - pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \ - 28, /* dst_w_basereg */ \ - 4, /* dst_r_basereg */ \ - 0, /* src_basereg */ \ - 27 /* mask_basereg */ - -/******************************************************************************/ - -.macro pixman_composite_add_n_8_8888_init - add DUMMY, sp, #ARGS_STACK_OFFSET - vld1.32 {d3[0]}, [DUMMY] - vdup.8 d0, d3[0] - vdup.8 d1, d3[1] - vdup.8 d2, d3[2] - vdup.8 d3, d3[3] -.endm - -.macro pixman_composite_add_n_8_8888_cleanup -.endm - -generate_composite_function \ - pixman_composite_add_n_8_8888_asm_neon, 0, 8, 32, \ - FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - 5, /* prefetch distance */ \ - pixman_composite_add_n_8_8888_init, \ - pixman_composite_add_n_8_8888_cleanup, \ - pixman_composite_add_8888_8888_8888_process_pixblock_head, \ - pixman_composite_add_8888_8888_8888_process_pixblock_tail, \ - pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \ - 28, /* dst_w_basereg */ \ - 4, /* dst_r_basereg */ \ - 0, /* src_basereg */ \ - 27 /* mask_basereg */ - -/******************************************************************************/ - -.macro pixman_composite_add_8888_n_8888_init - add DUMMY, sp, #(ARGS_STACK_OFFSET + 8) - vld1.32 {d27[0]}, [DUMMY] - vdup.8 d27, d27[3] -.endm - -.macro pixman_composite_add_8888_n_8888_cleanup -.endm - -generate_composite_function \ - pixman_composite_add_8888_n_8888_asm_neon, 32, 0, 32, \ - FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - 5, /* prefetch distance */ \ - pixman_composite_add_8888_n_8888_init, \ - pixman_composite_add_8888_n_8888_cleanup, \ - pixman_composite_add_8888_8888_8888_process_pixblock_head, \ - pixman_composite_add_8888_8888_8888_process_pixblock_tail, \ - pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \ - 28, /* dst_w_basereg */ \ - 4, /* dst_r_basereg */ \ - 0, /* 
src_basereg */ \ - 27 /* mask_basereg */ - -/******************************************************************************/ - -.macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_head - /* expecting source data in {d0, d1, d2, d3} */ - /* destination data in {d4, d5, d6, d7} */ - /* solid mask is in d15 */ - - /* 'in' */ - vmull.u8 q8, d15, d3 - vmull.u8 q6, d15, d2 - vmull.u8 q5, d15, d1 - vmull.u8 q4, d15, d0 - vrshr.u16 q13, q8, #8 - vrshr.u16 q12, q6, #8 - vrshr.u16 q11, q5, #8 - vrshr.u16 q10, q4, #8 - vraddhn.u16 d3, q8, q13 - vraddhn.u16 d2, q6, q12 - vraddhn.u16 d1, q5, q11 - vraddhn.u16 d0, q4, q10 - vmvn.8 d24, d3 /* get inverted alpha */ - /* now do alpha blending */ - vmull.u8 q8, d24, d4 - vmull.u8 q9, d24, d5 - vmull.u8 q10, d24, d6 - vmull.u8 q11, d24, d7 -.endm - -.macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail - vrshr.u16 q14, q8, #8 - vrshr.u16 q15, q9, #8 - vrshr.u16 q12, q10, #8 - vrshr.u16 q13, q11, #8 - vraddhn.u16 d28, q14, q8 - vraddhn.u16 d29, q15, q9 - vraddhn.u16 d30, q12, q10 - vraddhn.u16 d31, q13, q11 -.endm - -/* TODO: expand macros and do better instructions scheduling */ -.macro pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head - vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! - pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail - fetch_src_pixblock - cache_preload 8, 8 - fetch_mask_pixblock - pixman_composite_out_reverse_8888_n_8888_process_pixblock_head - vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! -.endm - -generate_composite_function_single_scanline \ - pixman_composite_scanline_out_reverse_mask_asm_neon, 32, 32, 32, \ - FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - default_init_need_all_regs, \ - default_cleanup_need_all_regs, \ - pixman_composite_out_reverse_8888_n_8888_process_pixblock_head, \ - pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail, \ - pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head \ - 28, /* dst_w_basereg */ \ - 4, /* dst_r_basereg */ \ - 0, /* src_basereg */ \ - 12 /* mask_basereg */ - -/******************************************************************************/ - -.macro pixman_composite_over_8888_n_8888_process_pixblock_head - pixman_composite_out_reverse_8888_n_8888_process_pixblock_head -.endm - -.macro pixman_composite_over_8888_n_8888_process_pixblock_tail - pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail - vqadd.u8 q14, q0, q14 - vqadd.u8 q15, q1, q15 -.endm - -/* TODO: expand macros and do better instructions scheduling */ -.macro pixman_composite_over_8888_n_8888_process_pixblock_tail_head - vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! - pixman_composite_over_8888_n_8888_process_pixblock_tail - fetch_src_pixblock - cache_preload 8, 8 - pixman_composite_over_8888_n_8888_process_pixblock_head - vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! 
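Note how over_8888_n_8888 above reuses the out_reverse head and tail unchanged and appends only two vqadd instructions: OVER is OUT_REVERSE plus a saturating add of the masked source. A scalar sketch of one channel, with illustrative names and the same rounded /255 helper:

#include <stdint.h>

static uint8_t div255(uint16_t t)        /* rounded t/255 */
{
    return (uint8_t)((t + 128 + ((t + 128) >> 8)) >> 8);
}

/* out_reverse_8888_n_8888, one channel: source scaled by the solid mask,
 * then the destination scaled by the inverse of the resulting alpha. */
static uint8_t out_reverse_channel(uint8_t d, uint8_t s_alpha, uint8_t m)
{
    uint8_t sa_in = div255((uint16_t)s_alpha * m);
    return div255((uint16_t)(d * (255 - sa_in)));
}

/* over_8888_n_8888, one channel: OUT_REVERSE plus the masked source,
 * with saturation (the two appended vqadd.u8 instructions). */
static uint8_t over_channel(uint8_t s, uint8_t d, uint8_t s_alpha, uint8_t m)
{
    uint16_t sum = div255((uint16_t)s * m) + out_reverse_channel(d, s_alpha, m);
    return sum > 255 ? 255 : (uint8_t)sum;
}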
-.endm - -.macro pixman_composite_over_8888_n_8888_init - add DUMMY, sp, #48 - vpush {d8-d15} - vld1.32 {d15[0]}, [DUMMY] - vdup.8 d15, d15[3] -.endm - -.macro pixman_composite_over_8888_n_8888_cleanup - vpop {d8-d15} -.endm - -generate_composite_function \ - pixman_composite_over_8888_n_8888_asm_neon, 32, 0, 32, \ - FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - 5, /* prefetch distance */ \ - pixman_composite_over_8888_n_8888_init, \ - pixman_composite_over_8888_n_8888_cleanup, \ - pixman_composite_over_8888_n_8888_process_pixblock_head, \ - pixman_composite_over_8888_n_8888_process_pixblock_tail, \ - pixman_composite_over_8888_n_8888_process_pixblock_tail_head - -/******************************************************************************/ - -/* TODO: expand macros and do better instructions scheduling */ -.macro pixman_composite_over_8888_8888_8888_process_pixblock_tail_head - vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! - pixman_composite_over_8888_n_8888_process_pixblock_tail - fetch_src_pixblock - cache_preload 8, 8 - fetch_mask_pixblock - pixman_composite_over_8888_n_8888_process_pixblock_head - vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! -.endm - -generate_composite_function \ - pixman_composite_over_8888_8888_8888_asm_neon, 32, 32, 32, \ - FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - 5, /* prefetch distance */ \ - default_init_need_all_regs, \ - default_cleanup_need_all_regs, \ - pixman_composite_over_8888_n_8888_process_pixblock_head, \ - pixman_composite_over_8888_n_8888_process_pixblock_tail, \ - pixman_composite_over_8888_8888_8888_process_pixblock_tail_head \ - 28, /* dst_w_basereg */ \ - 4, /* dst_r_basereg */ \ - 0, /* src_basereg */ \ - 12 /* mask_basereg */ - -generate_composite_function_single_scanline \ - pixman_composite_scanline_over_mask_asm_neon, 32, 32, 32, \ - FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - default_init_need_all_regs, \ - default_cleanup_need_all_regs, \ - pixman_composite_over_8888_n_8888_process_pixblock_head, \ - pixman_composite_over_8888_n_8888_process_pixblock_tail, \ - pixman_composite_over_8888_8888_8888_process_pixblock_tail_head \ - 28, /* dst_w_basereg */ \ - 4, /* dst_r_basereg */ \ - 0, /* src_basereg */ \ - 12 /* mask_basereg */ - -/******************************************************************************/ - -/* TODO: expand macros and do better instructions scheduling */ -.macro pixman_composite_over_8888_8_8888_process_pixblock_tail_head - vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! - pixman_composite_over_8888_n_8888_process_pixblock_tail - fetch_src_pixblock - cache_preload 8, 8 - fetch_mask_pixblock - pixman_composite_over_8888_n_8888_process_pixblock_head - vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! 
-.endm - -generate_composite_function \ - pixman_composite_over_8888_8_8888_asm_neon, 32, 8, 32, \ - FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - 5, /* prefetch distance */ \ - default_init_need_all_regs, \ - default_cleanup_need_all_regs, \ - pixman_composite_over_8888_n_8888_process_pixblock_head, \ - pixman_composite_over_8888_n_8888_process_pixblock_tail, \ - pixman_composite_over_8888_8_8888_process_pixblock_tail_head \ - 28, /* dst_w_basereg */ \ - 4, /* dst_r_basereg */ \ - 0, /* src_basereg */ \ - 15 /* mask_basereg */ - -/******************************************************************************/ - -.macro pixman_composite_src_0888_0888_process_pixblock_head -.endm - -.macro pixman_composite_src_0888_0888_process_pixblock_tail -.endm - -.macro pixman_composite_src_0888_0888_process_pixblock_tail_head - vst3.8 {d0, d1, d2}, [DST_W]! - fetch_src_pixblock - cache_preload 8, 8 -.endm - -generate_composite_function \ - pixman_composite_src_0888_0888_asm_neon, 24, 0, 24, \ - FLAG_DST_WRITEONLY, \ - 8, /* number of pixels, processed in a single block */ \ - 10, /* prefetch distance */ \ - default_init, \ - default_cleanup, \ - pixman_composite_src_0888_0888_process_pixblock_head, \ - pixman_composite_src_0888_0888_process_pixblock_tail, \ - pixman_composite_src_0888_0888_process_pixblock_tail_head, \ - 0, /* dst_w_basereg */ \ - 0, /* dst_r_basereg */ \ - 0, /* src_basereg */ \ - 0 /* mask_basereg */ - -/******************************************************************************/ - -.macro pixman_composite_src_0888_8888_rev_process_pixblock_head - vswp d0, d2 -.endm - -.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail -.endm - -.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail_head - vst4.8 {d0, d1, d2, d3}, [DST_W]! - fetch_src_pixblock - vswp d0, d2 - cache_preload 8, 8 -.endm - -.macro pixman_composite_src_0888_8888_rev_init - veor d3, d3, d3 -.endm - -generate_composite_function \ - pixman_composite_src_0888_8888_rev_asm_neon, 24, 0, 32, \ - FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - 10, /* prefetch distance */ \ - pixman_composite_src_0888_8888_rev_init, \ - default_cleanup, \ - pixman_composite_src_0888_8888_rev_process_pixblock_head, \ - pixman_composite_src_0888_8888_rev_process_pixblock_tail, \ - pixman_composite_src_0888_8888_rev_process_pixblock_tail_head, \ - 0, /* dst_w_basereg */ \ - 0, /* dst_r_basereg */ \ - 0, /* src_basereg */ \ - 0 /* mask_basereg */ - -/******************************************************************************/ - -.macro pixman_composite_src_0888_0565_rev_process_pixblock_head - vshll.u8 q8, d1, #8 - vshll.u8 q9, d2, #8 -.endm - -.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail - vshll.u8 q14, d0, #8 - vsri.u16 q14, q8, #5 - vsri.u16 q14, q9, #11 -.endm - -.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail_head - vshll.u8 q14, d0, #8 - fetch_src_pixblock - vsri.u16 q14, q8, #5 - vsri.u16 q14, q9, #11 - vshll.u8 q8, d1, #8 - vst1.16 {d28, d29}, [DST_W, :128]! 
- vshll.u8 q9, d2, #8 -.endm - -generate_composite_function \ - pixman_composite_src_0888_0565_rev_asm_neon, 24, 0, 16, \ - FLAG_DST_WRITEONLY, \ - 8, /* number of pixels, processed in a single block */ \ - 10, /* prefetch distance */ \ - default_init, \ - default_cleanup, \ - pixman_composite_src_0888_0565_rev_process_pixblock_head, \ - pixman_composite_src_0888_0565_rev_process_pixblock_tail, \ - pixman_composite_src_0888_0565_rev_process_pixblock_tail_head, \ - 28, /* dst_w_basereg */ \ - 0, /* dst_r_basereg */ \ - 0, /* src_basereg */ \ - 0 /* mask_basereg */ - -/******************************************************************************/ - -.macro pixman_composite_src_pixbuf_8888_process_pixblock_head - vmull.u8 q8, d3, d0 - vmull.u8 q9, d3, d1 - vmull.u8 q10, d3, d2 -.endm - -.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail - vrshr.u16 q11, q8, #8 - vswp d3, d31 - vrshr.u16 q12, q9, #8 - vrshr.u16 q13, q10, #8 - vraddhn.u16 d30, q11, q8 - vraddhn.u16 d29, q12, q9 - vraddhn.u16 d28, q13, q10 -.endm - -.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail_head - vrshr.u16 q11, q8, #8 - vswp d3, d31 - vrshr.u16 q12, q9, #8 - vrshr.u16 q13, q10, #8 - fetch_src_pixblock - vraddhn.u16 d30, q11, q8 - PF add PF_X, PF_X, #8 - PF tst PF_CTL, #0xF - PF addne PF_X, PF_X, #8 - PF subne PF_CTL, PF_CTL, #1 - vraddhn.u16 d29, q12, q9 - vraddhn.u16 d28, q13, q10 - vmull.u8 q8, d3, d0 - vmull.u8 q9, d3, d1 - vmull.u8 q10, d3, d2 - vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! - PF cmp PF_X, ORIG_W - PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] - PF subge PF_X, PF_X, ORIG_W - PF subges PF_CTL, PF_CTL, #0x10 - PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! -.endm - -generate_composite_function \ - pixman_composite_src_pixbuf_8888_asm_neon, 32, 0, 32, \ - FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - 10, /* prefetch distance */ \ - default_init, \ - default_cleanup, \ - pixman_composite_src_pixbuf_8888_process_pixblock_head, \ - pixman_composite_src_pixbuf_8888_process_pixblock_tail, \ - pixman_composite_src_pixbuf_8888_process_pixblock_tail_head, \ - 28, /* dst_w_basereg */ \ - 0, /* dst_r_basereg */ \ - 0, /* src_basereg */ \ - 0 /* mask_basereg */ - -/******************************************************************************/ - -.macro pixman_composite_src_rpixbuf_8888_process_pixblock_head - vmull.u8 q8, d3, d0 - vmull.u8 q9, d3, d1 - vmull.u8 q10, d3, d2 -.endm - -.macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail - vrshr.u16 q11, q8, #8 - vswp d3, d31 - vrshr.u16 q12, q9, #8 - vrshr.u16 q13, q10, #8 - vraddhn.u16 d28, q11, q8 - vraddhn.u16 d29, q12, q9 - vraddhn.u16 d30, q13, q10 -.endm - -.macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head - vrshr.u16 q11, q8, #8 - vswp d3, d31 - vrshr.u16 q12, q9, #8 - vrshr.u16 q13, q10, #8 - fetch_src_pixblock - vraddhn.u16 d28, q11, q8 - PF add PF_X, PF_X, #8 - PF tst PF_CTL, #0xF - PF addne PF_X, PF_X, #8 - PF subne PF_CTL, PF_CTL, #1 - vraddhn.u16 d29, q12, q9 - vraddhn.u16 d30, q13, q10 - vmull.u8 q8, d3, d0 - vmull.u8 q9, d3, d1 - vmull.u8 q10, d3, d2 - vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! - PF cmp PF_X, ORIG_W - PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] - PF subge PF_X, PF_X, ORIG_W - PF subges PF_CTL, PF_CTL, #0x10 - PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! 
-.endm - -generate_composite_function \ - pixman_composite_src_rpixbuf_8888_asm_neon, 32, 0, 32, \ - FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - 10, /* prefetch distance */ \ - default_init, \ - default_cleanup, \ - pixman_composite_src_rpixbuf_8888_process_pixblock_head, \ - pixman_composite_src_rpixbuf_8888_process_pixblock_tail, \ - pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head, \ - 28, /* dst_w_basereg */ \ - 0, /* dst_r_basereg */ \ - 0, /* src_basereg */ \ - 0 /* mask_basereg */ - -/******************************************************************************/ - -.macro pixman_composite_over_0565_8_0565_process_pixblock_head - /* mask is in d15 */ - convert_0565_to_x888 q4, d2, d1, d0 - convert_0565_to_x888 q5, d6, d5, d4 - /* source pixel data is in {d0, d1, d2, XX} */ - /* destination pixel data is in {d4, d5, d6, XX} */ - vmvn.8 d7, d15 - vmull.u8 q6, d15, d2 - vmull.u8 q5, d15, d1 - vmull.u8 q4, d15, d0 - vmull.u8 q8, d7, d4 - vmull.u8 q9, d7, d5 - vmull.u8 q13, d7, d6 - vrshr.u16 q12, q6, #8 - vrshr.u16 q11, q5, #8 - vrshr.u16 q10, q4, #8 - vraddhn.u16 d2, q6, q12 - vraddhn.u16 d1, q5, q11 - vraddhn.u16 d0, q4, q10 -.endm - -.macro pixman_composite_over_0565_8_0565_process_pixblock_tail - vrshr.u16 q14, q8, #8 - vrshr.u16 q15, q9, #8 - vrshr.u16 q12, q13, #8 - vraddhn.u16 d28, q14, q8 - vraddhn.u16 d29, q15, q9 - vraddhn.u16 d30, q12, q13 - vqadd.u8 q0, q0, q14 - vqadd.u8 q1, q1, q15 - /* 32bpp result is in {d0, d1, d2, XX} */ - convert_8888_to_0565 d2, d1, d0, q14, q15, q3 -.endm - -/* TODO: expand macros and do better instructions scheduling */ -.macro pixman_composite_over_0565_8_0565_process_pixblock_tail_head - fetch_mask_pixblock - pixman_composite_over_0565_8_0565_process_pixblock_tail - fetch_src_pixblock - vld1.16 {d10, d11}, [DST_R, :128]! - cache_preload 8, 8 - pixman_composite_over_0565_8_0565_process_pixblock_head - vst1.16 {d28, d29}, [DST_W, :128]! 
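The convert_0565_to_x888 / convert_8888_to_0565 helpers used by this function widen the 5- and 6-bit fields to 8 bits and pack them back. A scalar sketch of the usual bit-replicating widening (the same idea as the vsri-based r5g6b5 unpacking visible elsewhere in this patch) and the corresponding packing; names are illustrative:

#include <stdint.h>

/* Widen one r5g6b5 pixel to 8-bit channels, replicating the top bits
 * of each field into the freed low bits. */
static void expand_0565(uint16_t p, uint8_t *r, uint8_t *g, uint8_t *b)
{
    uint8_t r5 = (uint8_t)((p >> 11) & 0x1f);
    uint8_t g6 = (uint8_t)((p >> 5) & 0x3f);
    uint8_t b5 = (uint8_t)(p & 0x1f);
    *r = (uint8_t)((r5 << 3) | (r5 >> 2));
    *g = (uint8_t)((g6 << 2) | (g6 >> 4));
    *b = (uint8_t)((b5 << 3) | (b5 >> 2));
}

/* Pack 8-bit channels back to r5g6b5 by dropping the low bits. */
static uint16_t pack_0565(uint8_t r, uint8_t g, uint8_t b)
{
    return (uint16_t)(((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3));
}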
-.endm - -generate_composite_function \ - pixman_composite_over_0565_8_0565_asm_neon, 16, 8, 16, \ - FLAG_DST_READWRITE, \ - 8, /* number of pixels, processed in a single block */ \ - 5, /* prefetch distance */ \ - default_init_need_all_regs, \ - default_cleanup_need_all_regs, \ - pixman_composite_over_0565_8_0565_process_pixblock_head, \ - pixman_composite_over_0565_8_0565_process_pixblock_tail, \ - pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \ - 28, /* dst_w_basereg */ \ - 10, /* dst_r_basereg */ \ - 8, /* src_basereg */ \ - 15 /* mask_basereg */ - -/******************************************************************************/ - -.macro pixman_composite_over_0565_n_0565_init - add DUMMY, sp, #(ARGS_STACK_OFFSET + 8) - vpush {d8-d15} - vld1.32 {d15[0]}, [DUMMY] - vdup.8 d15, d15[3] -.endm - -.macro pixman_composite_over_0565_n_0565_cleanup - vpop {d8-d15} -.endm - -generate_composite_function \ - pixman_composite_over_0565_n_0565_asm_neon, 16, 0, 16, \ - FLAG_DST_READWRITE, \ - 8, /* number of pixels, processed in a single block */ \ - 5, /* prefetch distance */ \ - pixman_composite_over_0565_n_0565_init, \ - pixman_composite_over_0565_n_0565_cleanup, \ - pixman_composite_over_0565_8_0565_process_pixblock_head, \ - pixman_composite_over_0565_8_0565_process_pixblock_tail, \ - pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \ - 28, /* dst_w_basereg */ \ - 10, /* dst_r_basereg */ \ - 8, /* src_basereg */ \ - 15 /* mask_basereg */ - -/******************************************************************************/ - -.macro pixman_composite_add_0565_8_0565_process_pixblock_head - /* mask is in d15 */ - convert_0565_to_x888 q4, d2, d1, d0 - convert_0565_to_x888 q5, d6, d5, d4 - /* source pixel data is in {d0, d1, d2, XX} */ - /* destination pixel data is in {d4, d5, d6, XX} */ - vmull.u8 q6, d15, d2 - vmull.u8 q5, d15, d1 - vmull.u8 q4, d15, d0 - vrshr.u16 q12, q6, #8 - vrshr.u16 q11, q5, #8 - vrshr.u16 q10, q4, #8 - vraddhn.u16 d2, q6, q12 - vraddhn.u16 d1, q5, q11 - vraddhn.u16 d0, q4, q10 -.endm - -.macro pixman_composite_add_0565_8_0565_process_pixblock_tail - vqadd.u8 q0, q0, q2 - vqadd.u8 q1, q1, q3 - /* 32bpp result is in {d0, d1, d2, XX} */ - convert_8888_to_0565 d2, d1, d0, q14, q15, q3 -.endm - -/* TODO: expand macros and do better instructions scheduling */ -.macro pixman_composite_add_0565_8_0565_process_pixblock_tail_head - fetch_mask_pixblock - pixman_composite_add_0565_8_0565_process_pixblock_tail - fetch_src_pixblock - vld1.16 {d10, d11}, [DST_R, :128]! - cache_preload 8, 8 - pixman_composite_add_0565_8_0565_process_pixblock_head - vst1.16 {d28, d29}, [DST_W, :128]! 
-.endm - -generate_composite_function \ - pixman_composite_add_0565_8_0565_asm_neon, 16, 8, 16, \ - FLAG_DST_READWRITE, \ - 8, /* number of pixels, processed in a single block */ \ - 5, /* prefetch distance */ \ - default_init_need_all_regs, \ - default_cleanup_need_all_regs, \ - pixman_composite_add_0565_8_0565_process_pixblock_head, \ - pixman_composite_add_0565_8_0565_process_pixblock_tail, \ - pixman_composite_add_0565_8_0565_process_pixblock_tail_head, \ - 28, /* dst_w_basereg */ \ - 10, /* dst_r_basereg */ \ - 8, /* src_basereg */ \ - 15 /* mask_basereg */ - -/******************************************************************************/ - -.macro pixman_composite_out_reverse_8_0565_process_pixblock_head - /* mask is in d15 */ - convert_0565_to_x888 q5, d6, d5, d4 - /* destination pixel data is in {d4, d5, d6, xx} */ - vmvn.8 d24, d15 /* get inverted alpha */ - /* now do alpha blending */ - vmull.u8 q8, d24, d4 - vmull.u8 q9, d24, d5 - vmull.u8 q10, d24, d6 -.endm - -.macro pixman_composite_out_reverse_8_0565_process_pixblock_tail - vrshr.u16 q14, q8, #8 - vrshr.u16 q15, q9, #8 - vrshr.u16 q12, q10, #8 - vraddhn.u16 d0, q14, q8 - vraddhn.u16 d1, q15, q9 - vraddhn.u16 d2, q12, q10 - /* 32bpp result is in {d0, d1, d2, XX} */ - convert_8888_to_0565 d2, d1, d0, q14, q15, q3 -.endm - -/* TODO: expand macros and do better instructions scheduling */ -.macro pixman_composite_out_reverse_8_0565_process_pixblock_tail_head - fetch_src_pixblock - pixman_composite_out_reverse_8_0565_process_pixblock_tail - vld1.16 {d10, d11}, [DST_R, :128]! - cache_preload 8, 8 - pixman_composite_out_reverse_8_0565_process_pixblock_head - vst1.16 {d28, d29}, [DST_W, :128]! -.endm - -generate_composite_function \ - pixman_composite_out_reverse_8_0565_asm_neon, 8, 0, 16, \ - FLAG_DST_READWRITE, \ - 8, /* number of pixels, processed in a single block */ \ - 5, /* prefetch distance */ \ - default_init_need_all_regs, \ - default_cleanup_need_all_regs, \ - pixman_composite_out_reverse_8_0565_process_pixblock_head, \ - pixman_composite_out_reverse_8_0565_process_pixblock_tail, \ - pixman_composite_out_reverse_8_0565_process_pixblock_tail_head, \ - 28, /* dst_w_basereg */ \ - 10, /* dst_r_basereg */ \ - 15, /* src_basereg */ \ - 0 /* mask_basereg */ - -/******************************************************************************/ - -generate_composite_function_nearest_scanline \ - pixman_scaled_nearest_scanline_8888_8888_OVER_asm_neon, 32, 0, 32, \ - FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - default_init, \ - default_cleanup, \ - pixman_composite_over_8888_8888_process_pixblock_head, \ - pixman_composite_over_8888_8888_process_pixblock_tail, \ - pixman_composite_over_8888_8888_process_pixblock_tail_head - -generate_composite_function_nearest_scanline \ - pixman_scaled_nearest_scanline_8888_0565_OVER_asm_neon, 32, 0, 16, \ - FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - default_init, \ - default_cleanup, \ - pixman_composite_over_8888_0565_process_pixblock_head, \ - pixman_composite_over_8888_0565_process_pixblock_tail, \ - pixman_composite_over_8888_0565_process_pixblock_tail_head, \ - 28, /* dst_w_basereg */ \ - 4, /* dst_r_basereg */ \ - 0, /* src_basereg */ \ - 24 /* mask_basereg */ - -generate_composite_function_nearest_scanline \ - pixman_scaled_nearest_scanline_8888_0565_SRC_asm_neon, 32, 0, 16, \ - FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in 
a single block */ \ - default_init, \ - default_cleanup, \ - pixman_composite_src_8888_0565_process_pixblock_head, \ - pixman_composite_src_8888_0565_process_pixblock_tail, \ - pixman_composite_src_8888_0565_process_pixblock_tail_head - -generate_composite_function_nearest_scanline \ - pixman_scaled_nearest_scanline_0565_8888_SRC_asm_neon, 16, 0, 32, \ - FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - default_init, \ - default_cleanup, \ - pixman_composite_src_0565_8888_process_pixblock_head, \ - pixman_composite_src_0565_8888_process_pixblock_tail, \ - pixman_composite_src_0565_8888_process_pixblock_tail_head +/* + * Copyright © 2009 Nokia Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Author: Siarhei Siamashka (siarhei.siamashka@nokia.com) + */ + +/* + * This file contains implementations of NEON optimized pixel processing + * functions. There is no full and detailed tutorial, but some functions + * (those which are exposing some new or interesting features) are + * extensively commented and can be used as examples. + * + * You may want to have a look at the comments for following functions: + * - pixman_composite_over_8888_0565_asm_neon + * - pixman_composite_over_n_8_0565_asm_neon + */ + +/* Prevent the stack from becoming executable for no reason... */ +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + + .text + .fpu neon + .arch armv7a + .object_arch armv4 + .eabi_attribute 10, 0 /* suppress Tag_FP_arch */ + .eabi_attribute 12, 0 /* suppress Tag_Advanced_SIMD_arch */ + .arm + .altmacro + +#include "pixman-arm-neon-asm.h" + +/* Global configuration options and preferences */ + +/* + * The code can optionally make use of unaligned memory accesses to improve + * performance of handling leading/trailing pixels for each scanline. + * Configuration variable RESPECT_STRICT_ALIGNMENT can be set to 0 for + * example in linux if unaligned memory accesses are not configured to + * generate.exceptions. + */ +.set RESPECT_STRICT_ALIGNMENT, 1 + +/* + * Set default prefetch type. 
There is a choice between the following options: + * + * PREFETCH_TYPE_NONE (may be useful for the ARM cores where PLD is set to work + * as NOP to work around some HW bugs or for whatever other reason) + * + * PREFETCH_TYPE_SIMPLE (may be useful for simple single-issue ARM cores where + * advanced prefetch introduces heavy overhead) + * + * PREFETCH_TYPE_ADVANCED (useful for superscalar cores such as ARM Cortex-A8 + * which can run ARM and NEON instructions simultaneously so that extra ARM + * instructions do not add (many) extra cycles, but improve prefetch efficiency) + * + * Note: some types of function can't support advanced prefetch and fall back + * to the simple one (those which handle 24bpp pixels) + */ +.set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_ADVANCED + +/* Prefetch distance in pixels for simple prefetch */ +.set PREFETCH_DISTANCE_SIMPLE, 64 + +/* + * Implementation of pixman_composite_over_8888_0565_asm_neon + * + * This function takes an a8r8g8b8 source buffer, an r5g6b5 destination buffer and + * performs the OVER compositing operation. Function fast_composite_over_8888_0565 + * from pixman-fast-path.c does the same in C and can be used as a reference. + * + * First we need to have some NEON assembly code which can do the actual + * operation on the pixels and provide it to the template macro. + * + * The template macro quite conveniently takes care of emitting all the necessary + * code for memory reading and writing (including quite tricky cases of + * handling unaligned leading/trailing pixels), so we only need to deal with + * the data in NEON registers. + * + * NEON register allocation in general is recommended to be the following: + * d0, d1, d2, d3 - contain loaded source pixel data + * d4, d5, d6, d7 - contain loaded destination pixels (if they are needed) + * d24, d25, d26, d27 - contain loaded mask pixel data (if mask is used) + * d28, d29, d30, d31 - place for storing the result (destination pixels) + * + * As can be seen above, four 64-bit NEON registers are used for keeping + * intermediate pixel data and up to 8 pixels can be processed in one step + * for 32bpp formats (16 pixels for 16bpp, 32 pixels for 8bpp). + * + * This particular function uses the following register allocation: + * d0, d1, d2, d3 - contain loaded source pixel data + * d4, d5 - contain loaded destination pixels (they are needed) + * d28, d29 - place for storing the result (destination pixels) + */ + +/* + * Step one. We need to have some code to do some arithmetic on pixel data. + * This is implemented as a pair of macros: '*_head' and '*_tail'. When used + * back-to-back, they take pixel data from {d0, d1, d2, d3} and {d4, d5}, + * perform all the needed calculations and write the result to {d28, d29}. + * The rationale for having two macros and not just one will be explained + * later. In practice, any single monolithic function which does the work can + * be split into two parts in any arbitrary way without affecting correctness. + * + * There is one special trick here too. The common template macro can optionally + * make our life a bit easier by doing R, G, B, A color components + * deinterleaving for 32bpp pixel formats (and this feature is used in + * the 'pixman_composite_over_8888_0565_asm_neon' function). So it means that + * instead of having 8 packed pixels in {d0, d1, d2, d3} registers, we + * actually use d0 register for blue channel (a vector of eight 8-bit + * values), d1 register for green, d2 for red and d3 for alpha.
This + * simple conversion can be also done with a few NEON instructions: + * + * Packed to planar conversion: + * vuzp.8 d0, d1 + * vuzp.8 d2, d3 + * vuzp.8 d1, d3 + * vuzp.8 d0, d2 + * + * Planar to packed conversion: + * vzip.8 d0, d2 + * vzip.8 d1, d3 + * vzip.8 d2, d3 + * vzip.8 d0, d1 + * + * But pixel can be loaded directly in planar format using VLD4.8 NEON + * instruction. It is 1 cycle slower than VLD1.32, so this is not always + * desirable, that's why deinterleaving is optional. + * + * But anyway, here is the code: + */ +.macro pixman_composite_over_8888_0565_process_pixblock_head + /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format + and put data into d6 - red, d7 - green, d30 - blue */ + vshrn.u16 d6, q2, #8 + vshrn.u16 d7, q2, #3 + vsli.u16 q2, q2, #5 + vsri.u8 d6, d6, #5 + vmvn.8 d3, d3 /* invert source alpha */ + vsri.u8 d7, d7, #6 + vshrn.u16 d30, q2, #2 + /* now do alpha blending, storing results in 8-bit planar format + into d16 - red, d19 - green, d18 - blue */ + vmull.u8 q10, d3, d6 + vmull.u8 q11, d3, d7 + vmull.u8 q12, d3, d30 + vrshr.u16 q13, q10, #8 + vrshr.u16 q3, q11, #8 + vrshr.u16 q15, q12, #8 + vraddhn.u16 d20, q10, q13 + vraddhn.u16 d23, q11, q3 + vraddhn.u16 d22, q12, q15 +.endm + +.macro pixman_composite_over_8888_0565_process_pixblock_tail + /* ... continue alpha blending */ + vqadd.u8 d16, d2, d20 + vqadd.u8 q9, q0, q11 + /* convert the result to r5g6b5 and store it into {d28, d29} */ + vshll.u8 q14, d16, #8 + vshll.u8 q8, d19, #8 + vshll.u8 q9, d18, #8 + vsri.u16 q14, q8, #5 + vsri.u16 q14, q9, #11 +.endm + +/* + * OK, now we got almost everything that we need. Using the above two + * macros, the work can be done right. But now we want to optimize + * it a bit. ARM Cortex-A8 is an in-order core, and benefits really + * a lot from good code scheduling and software pipelining. + * + * Let's construct some code, which will run in the core main loop. + * Some pseudo-code of the main loop will look like this: + * head + * while (...) { + * tail + * head + * } + * tail + * + * It may look a bit weird, but this setup allows to hide instruction + * latencies better and also utilize dual-issue capability more + * efficiently (make pairs of load-store and ALU instructions). + * + * So what we need now is a '*_tail_head' macro, which will be used + * in the core main loop. A trivial straightforward implementation + * of this macro would look like this: + * + * pixman_composite_over_8888_0565_process_pixblock_tail + * vst1.16 {d28, d29}, [DST_W, :128]! + * vld1.16 {d4, d5}, [DST_R, :128]! + * vld4.32 {d0, d1, d2, d3}, [SRC]! + * pixman_composite_over_8888_0565_process_pixblock_head + * cache_preload 8, 8 + * + * Now it also got some VLD/VST instructions. We simply can't move from + * processing one block of pixels to the other one with just arithmetics. + * The previously processed data needs to be written to memory and new + * data needs to be fetched. Fortunately, this main loop does not deal + * with partial leading/trailing pixels and can load/store a full block + * of pixels in a bulk. Additionally, destination buffer is already + * 16 bytes aligned here (which is good for performance). + * + * New things here are DST_R, DST_W, SRC and MASK identifiers. These + * are the aliases for ARM registers which are used as pointers for + * accessing data. We maintain separate pointers for reading and writing + * destination buffer (DST_R and DST_W). + * + * Another new thing is 'cache_preload' macro. 
It is used for prefetching + * data into CPU L2 cache and improve performance when dealing with large + * images which are far larger than cache size. It uses one argument + * (actually two, but they need to be the same here) - number of pixels + * in a block. Looking into 'pixman-arm-neon-asm.h' can provide some + * details about this macro. Moreover, if good performance is needed + * the code from this macro needs to be copied into '*_tail_head' macro + * and mixed with the rest of code for optimal instructions scheduling. + * We are actually doing it below. + * + * Now after all the explanations, here is the optimized code. + * Different instruction streams (originaling from '*_head', '*_tail' + * and 'cache_preload' macro) use different indentation levels for + * better readability. Actually taking the code from one of these + * indentation levels and ignoring a few VLD/VST instructions would + * result in exactly the code from '*_head', '*_tail' or 'cache_preload' + * macro! + */ + +#if 1 + +.macro pixman_composite_over_8888_0565_process_pixblock_tail_head + vqadd.u8 d16, d2, d20 + vld1.16 {d4, d5}, [DST_R, :128]! + vqadd.u8 q9, q0, q11 + vshrn.u16 d6, q2, #8 + fetch_src_pixblock + vshrn.u16 d7, q2, #3 + vsli.u16 q2, q2, #5 + vshll.u8 q14, d16, #8 + PF add PF_X, PF_X, #8 + vshll.u8 q8, d19, #8 + PF tst PF_CTL, #0xF + vsri.u8 d6, d6, #5 + PF addne PF_X, PF_X, #8 + vmvn.8 d3, d3 + PF subne PF_CTL, PF_CTL, #1 + vsri.u8 d7, d7, #6 + vshrn.u16 d30, q2, #2 + vmull.u8 q10, d3, d6 + PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] + vmull.u8 q11, d3, d7 + vmull.u8 q12, d3, d30 + PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] + vsri.u16 q14, q8, #5 + PF cmp PF_X, ORIG_W + vshll.u8 q9, d18, #8 + vrshr.u16 q13, q10, #8 + PF subge PF_X, PF_X, ORIG_W + vrshr.u16 q3, q11, #8 + vrshr.u16 q15, q12, #8 + PF subges PF_CTL, PF_CTL, #0x10 + vsri.u16 q14, q9, #11 + PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! + vraddhn.u16 d20, q10, q13 + vraddhn.u16 d23, q11, q3 + PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! + vraddhn.u16 d22, q12, q15 + vst1.16 {d28, d29}, [DST_W, :128]! +.endm + +#else + +/* If we did not care much about the performance, we would just use this... */ +.macro pixman_composite_over_8888_0565_process_pixblock_tail_head + pixman_composite_over_8888_0565_process_pixblock_tail + vst1.16 {d28, d29}, [DST_W, :128]! + vld1.16 {d4, d5}, [DST_R, :128]! + fetch_src_pixblock + pixman_composite_over_8888_0565_process_pixblock_head + cache_preload 8, 8 +.endm + +#endif + +/* + * And now the final part. We are using 'generate_composite_function' macro + * to put all the stuff together. We are specifying the name of the function + * which we want to get, number of bits per pixel for the source, mask and + * destination (0 if unused, like mask in this case). Next come some bit + * flags: + * FLAG_DST_READWRITE - tells that the destination buffer is both read + * and written, for write-only buffer we would use + * FLAG_DST_WRITEONLY flag instead + * FLAG_DEINTERLEAVE_32BPP - tells that we prefer to work with planar data + * and separate color channels for 32bpp format. + * The next things are: + * - the number of pixels processed per iteration (8 in this case, because + * that's the maximum what can fit into four 64-bit NEON registers). + * - prefetch distance, measured in pixel blocks. In this case it is 5 times + * by 8 pixels. That would be 40 pixels, or up to 160 bytes. Optimal + * prefetch distance can be selected by running some benchmarks. 
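The software-pipelined loop shape described in this comment (head; while (...) { tail; head; } tail) can be pictured as a small compilable C skeleton: a 'head' starts work on a block, a 'tail' finishes and stores the previous one, and the main loop interleaves them with a prefetch hint (the GCC/Clang __builtin_prefetch standing in for PLD / cache_preload). The block operation itself is a placeholder, not the real blend, and all names are illustrative.

#include <stdint.h>
#include <string.h>

#define BLOCK 8   /* pixels per block, as in the 8-pixel pixblocks above */

static void block_head(const uint8_t *src, uint8_t *tmp)  /* start work on a block */
{
    for (int i = 0; i < BLOCK; i++)
        tmp[i] = (uint8_t)(src[i] ^ 0xff);                 /* placeholder arithmetic */
}

static void block_tail(const uint8_t *tmp, uint8_t *dst)   /* finish and store a block */
{
    memcpy(dst, tmp, BLOCK);
}

/* Process nblocks (>= 1) blocks with the head/tail interleaving. */
static void scanline(const uint8_t *src, uint8_t *dst, int nblocks)
{
    uint8_t tmp[BLOCK];
    block_head(src, tmp);                                  /* head */
    for (int i = 1; i < nblocks; i++) {                    /* while (...) { tail; head; } */
        if (i + 4 < nblocks)
            __builtin_prefetch(src + (i + 4) * BLOCK);     /* rough analogue of PLD */
        block_tail(tmp, dst + (i - 1) * BLOCK);
        block_head(src + i * BLOCK, tmp);
    }
    block_tail(tmp, dst + (nblocks - 1) * BLOCK);          /* tail */
}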
+ * + * After that we specify some macros, these are 'default_init', + * 'default_cleanup' here which are empty (but it is possible to have custom + * init/cleanup macros to be able to save/restore some extra NEON registers + * like d8-d15 or do anything else) followed by + * 'pixman_composite_over_8888_0565_process_pixblock_head', + * 'pixman_composite_over_8888_0565_process_pixblock_tail' and + * 'pixman_composite_over_8888_0565_process_pixblock_tail_head' + * which we got implemented above. + * + * The last part is the NEON registers allocation scheme. + */ +generate_composite_function \ + pixman_composite_over_8888_0565_asm_neon, 32, 0, 16, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + default_init, \ + default_cleanup, \ + pixman_composite_over_8888_0565_process_pixblock_head, \ + pixman_composite_over_8888_0565_process_pixblock_tail, \ + pixman_composite_over_8888_0565_process_pixblock_tail_head, \ + 28, /* dst_w_basereg */ \ + 4, /* dst_r_basereg */ \ + 0, /* src_basereg */ \ + 24 /* mask_basereg */ + +/******************************************************************************/ + +.macro pixman_composite_over_n_0565_process_pixblock_head + /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format + and put data into d6 - red, d7 - green, d30 - blue */ + vshrn.u16 d6, q2, #8 + vshrn.u16 d7, q2, #3 + vsli.u16 q2, q2, #5 + vsri.u8 d6, d6, #5 + vsri.u8 d7, d7, #6 + vshrn.u16 d30, q2, #2 + /* now do alpha blending, storing results in 8-bit planar format + into d16 - red, d19 - green, d18 - blue */ + vmull.u8 q10, d3, d6 + vmull.u8 q11, d3, d7 + vmull.u8 q12, d3, d30 + vrshr.u16 q13, q10, #8 + vrshr.u16 q3, q11, #8 + vrshr.u16 q15, q12, #8 + vraddhn.u16 d20, q10, q13 + vraddhn.u16 d23, q11, q3 + vraddhn.u16 d22, q12, q15 +.endm + +.macro pixman_composite_over_n_0565_process_pixblock_tail + /* ... continue alpha blending */ + vqadd.u8 d16, d2, d20 + vqadd.u8 q9, q0, q11 + /* convert the result to r5g6b5 and store it into {d28, d29} */ + vshll.u8 q14, d16, #8 + vshll.u8 q8, d19, #8 + vshll.u8 q9, d18, #8 + vsri.u16 q14, q8, #5 + vsri.u16 q14, q9, #11 +.endm + +/* TODO: expand macros and do better instructions scheduling */ +.macro pixman_composite_over_n_0565_process_pixblock_tail_head + pixman_composite_over_n_0565_process_pixblock_tail + vld1.16 {d4, d5}, [DST_R, :128]! + vst1.16 {d28, d29}, [DST_W, :128]! 
+ pixman_composite_over_n_0565_process_pixblock_head + cache_preload 8, 8 +.endm + +.macro pixman_composite_over_n_0565_init + add DUMMY, sp, #ARGS_STACK_OFFSET + vld1.32 {d3[0]}, [DUMMY] + vdup.8 d0, d3[0] + vdup.8 d1, d3[1] + vdup.8 d2, d3[2] + vdup.8 d3, d3[3] + vmvn.8 d3, d3 /* invert source alpha */ +.endm + +generate_composite_function \ + pixman_composite_over_n_0565_asm_neon, 0, 0, 16, \ + FLAG_DST_READWRITE, \ + 8, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + pixman_composite_over_n_0565_init, \ + default_cleanup, \ + pixman_composite_over_n_0565_process_pixblock_head, \ + pixman_composite_over_n_0565_process_pixblock_tail, \ + pixman_composite_over_n_0565_process_pixblock_tail_head, \ + 28, /* dst_w_basereg */ \ + 4, /* dst_r_basereg */ \ + 0, /* src_basereg */ \ + 24 /* mask_basereg */ + +/******************************************************************************/ + +.macro pixman_composite_src_8888_0565_process_pixblock_head + vshll.u8 q8, d1, #8 + vshll.u8 q14, d2, #8 + vshll.u8 q9, d0, #8 +.endm + +.macro pixman_composite_src_8888_0565_process_pixblock_tail + vsri.u16 q14, q8, #5 + vsri.u16 q14, q9, #11 +.endm + +.macro pixman_composite_src_8888_0565_process_pixblock_tail_head + vsri.u16 q14, q8, #5 + PF add PF_X, PF_X, #8 + PF tst PF_CTL, #0xF + fetch_src_pixblock + PF addne PF_X, PF_X, #8 + PF subne PF_CTL, PF_CTL, #1 + vsri.u16 q14, q9, #11 + PF cmp PF_X, ORIG_W + PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] + vshll.u8 q8, d1, #8 + vst1.16 {d28, d29}, [DST_W, :128]! + PF subge PF_X, PF_X, ORIG_W + PF subges PF_CTL, PF_CTL, #0x10 + vshll.u8 q14, d2, #8 + PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! + vshll.u8 q9, d0, #8 +.endm + +generate_composite_function \ + pixman_composite_src_8888_0565_asm_neon, 32, 0, 16, \ + FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + 10, /* prefetch distance */ \ + default_init, \ + default_cleanup, \ + pixman_composite_src_8888_0565_process_pixblock_head, \ + pixman_composite_src_8888_0565_process_pixblock_tail, \ + pixman_composite_src_8888_0565_process_pixblock_tail_head + +/******************************************************************************/ + +.macro pixman_composite_src_0565_8888_process_pixblock_head + vshrn.u16 d30, q0, #8 + vshrn.u16 d29, q0, #3 + vsli.u16 q0, q0, #5 + vmov.u8 d31, #255 + vsri.u8 d30, d30, #5 + vsri.u8 d29, d29, #6 + vshrn.u16 d28, q0, #2 +.endm + +.macro pixman_composite_src_0565_8888_process_pixblock_tail +.endm + +/* TODO: expand macros and do better instructions scheduling */ +.macro pixman_composite_src_0565_8888_process_pixblock_tail_head + pixman_composite_src_0565_8888_process_pixblock_tail + vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! 
+ fetch_src_pixblock + pixman_composite_src_0565_8888_process_pixblock_head + cache_preload 8, 8 +.endm + +generate_composite_function \ + pixman_composite_src_0565_8888_asm_neon, 16, 0, 32, \ + FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + 10, /* prefetch distance */ \ + default_init, \ + default_cleanup, \ + pixman_composite_src_0565_8888_process_pixblock_head, \ + pixman_composite_src_0565_8888_process_pixblock_tail, \ + pixman_composite_src_0565_8888_process_pixblock_tail_head + +/******************************************************************************/ + +.macro pixman_composite_add_8_8_process_pixblock_head + vqadd.u8 q14, q0, q2 + vqadd.u8 q15, q1, q3 +.endm + +.macro pixman_composite_add_8_8_process_pixblock_tail +.endm + +.macro pixman_composite_add_8_8_process_pixblock_tail_head + fetch_src_pixblock + PF add PF_X, PF_X, #32 + PF tst PF_CTL, #0xF + vld1.8 {d4, d5, d6, d7}, [DST_R, :128]! + PF addne PF_X, PF_X, #32 + PF subne PF_CTL, PF_CTL, #1 + vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! + PF cmp PF_X, ORIG_W + PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] + PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] + PF subge PF_X, PF_X, ORIG_W + PF subges PF_CTL, PF_CTL, #0x10 + vqadd.u8 q14, q0, q2 + PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! + PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! + vqadd.u8 q15, q1, q3 +.endm + +generate_composite_function \ + pixman_composite_add_8_8_asm_neon, 8, 0, 8, \ + FLAG_DST_READWRITE, \ + 32, /* number of pixels, processed in a single block */ \ + 10, /* prefetch distance */ \ + default_init, \ + default_cleanup, \ + pixman_composite_add_8_8_process_pixblock_head, \ + pixman_composite_add_8_8_process_pixblock_tail, \ + pixman_composite_add_8_8_process_pixblock_tail_head + +/******************************************************************************/ + +.macro pixman_composite_add_8888_8888_process_pixblock_tail_head + fetch_src_pixblock + PF add PF_X, PF_X, #8 + PF tst PF_CTL, #0xF + vld1.32 {d4, d5, d6, d7}, [DST_R, :128]! + PF addne PF_X, PF_X, #8 + PF subne PF_CTL, PF_CTL, #1 + vst1.32 {d28, d29, d30, d31}, [DST_W, :128]! + PF cmp PF_X, ORIG_W + PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] + PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] + PF subge PF_X, PF_X, ORIG_W + PF subges PF_CTL, PF_CTL, #0x10 + vqadd.u8 q14, q0, q2 + PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! + PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! 
+ vqadd.u8 q15, q1, q3 +.endm + +generate_composite_function \ + pixman_composite_add_8888_8888_asm_neon, 32, 0, 32, \ + FLAG_DST_READWRITE, \ + 8, /* number of pixels, processed in a single block */ \ + 10, /* prefetch distance */ \ + default_init, \ + default_cleanup, \ + pixman_composite_add_8_8_process_pixblock_head, \ + pixman_composite_add_8_8_process_pixblock_tail, \ + pixman_composite_add_8888_8888_process_pixblock_tail_head + +generate_composite_function_single_scanline \ + pixman_composite_scanline_add_asm_neon, 32, 0, 32, \ + FLAG_DST_READWRITE, \ + 8, /* number of pixels, processed in a single block */ \ + default_init, \ + default_cleanup, \ + pixman_composite_add_8_8_process_pixblock_head, \ + pixman_composite_add_8_8_process_pixblock_tail, \ + pixman_composite_add_8888_8888_process_pixblock_tail_head + +/******************************************************************************/ + +.macro pixman_composite_out_reverse_8888_8888_process_pixblock_head + vmvn.8 d24, d3 /* get inverted alpha */ + /* do alpha blending */ + vmull.u8 q8, d24, d4 + vmull.u8 q9, d24, d5 + vmull.u8 q10, d24, d6 + vmull.u8 q11, d24, d7 +.endm + +.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail + vrshr.u16 q14, q8, #8 + vrshr.u16 q15, q9, #8 + vrshr.u16 q12, q10, #8 + vrshr.u16 q13, q11, #8 + vraddhn.u16 d28, q14, q8 + vraddhn.u16 d29, q15, q9 + vraddhn.u16 d30, q12, q10 + vraddhn.u16 d31, q13, q11 +.endm + +.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head + vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! + vrshr.u16 q14, q8, #8 + PF add PF_X, PF_X, #8 + PF tst PF_CTL, #0xF + vrshr.u16 q15, q9, #8 + vrshr.u16 q12, q10, #8 + vrshr.u16 q13, q11, #8 + PF addne PF_X, PF_X, #8 + PF subne PF_CTL, PF_CTL, #1 + vraddhn.u16 d28, q14, q8 + vraddhn.u16 d29, q15, q9 + PF cmp PF_X, ORIG_W + vraddhn.u16 d30, q12, q10 + vraddhn.u16 d31, q13, q11 + fetch_src_pixblock + PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] + vmvn.8 d22, d3 + PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] + vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! + PF subge PF_X, PF_X, ORIG_W + vmull.u8 q8, d22, d4 + PF subges PF_CTL, PF_CTL, #0x10 + vmull.u8 q9, d22, d5 + PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! + vmull.u8 q10, d22, d6 + PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! + vmull.u8 q11, d22, d7 +.endm + +generate_composite_function_single_scanline \ + pixman_composite_scanline_out_reverse_asm_neon, 32, 0, 32, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + default_init, \ + default_cleanup, \ + pixman_composite_out_reverse_8888_8888_process_pixblock_head, \ + pixman_composite_out_reverse_8888_8888_process_pixblock_tail, \ + pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head + +/******************************************************************************/ + +.macro pixman_composite_over_8888_8888_process_pixblock_head + pixman_composite_out_reverse_8888_8888_process_pixblock_head +.endm + +.macro pixman_composite_over_8888_8888_process_pixblock_tail + pixman_composite_out_reverse_8888_8888_process_pixblock_tail + vqadd.u8 q14, q0, q14 + vqadd.u8 q15, q1, q15 +.endm + +.macro pixman_composite_over_8888_8888_process_pixblock_tail_head + vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! 
+ vrshr.u16 q14, q8, #8 + PF add PF_X, PF_X, #8 + PF tst PF_CTL, #0xF + vrshr.u16 q15, q9, #8 + vrshr.u16 q12, q10, #8 + vrshr.u16 q13, q11, #8 + PF addne PF_X, PF_X, #8 + PF subne PF_CTL, PF_CTL, #1 + vraddhn.u16 d28, q14, q8 + vraddhn.u16 d29, q15, q9 + PF cmp PF_X, ORIG_W + vraddhn.u16 d30, q12, q10 + vraddhn.u16 d31, q13, q11 + vqadd.u8 q14, q0, q14 + vqadd.u8 q15, q1, q15 + fetch_src_pixblock + PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] + vmvn.8 d22, d3 + PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] + vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! + PF subge PF_X, PF_X, ORIG_W + vmull.u8 q8, d22, d4 + PF subges PF_CTL, PF_CTL, #0x10 + vmull.u8 q9, d22, d5 + PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! + vmull.u8 q10, d22, d6 + PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! + vmull.u8 q11, d22, d7 +.endm + +generate_composite_function \ + pixman_composite_over_8888_8888_asm_neon, 32, 0, 32, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + default_init, \ + default_cleanup, \ + pixman_composite_over_8888_8888_process_pixblock_head, \ + pixman_composite_over_8888_8888_process_pixblock_tail, \ + pixman_composite_over_8888_8888_process_pixblock_tail_head + +generate_composite_function_single_scanline \ + pixman_composite_scanline_over_asm_neon, 32, 0, 32, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + default_init, \ + default_cleanup, \ + pixman_composite_over_8888_8888_process_pixblock_head, \ + pixman_composite_over_8888_8888_process_pixblock_tail, \ + pixman_composite_over_8888_8888_process_pixblock_tail_head + +/******************************************************************************/ + +/* TODO: expand macros and do better instructions scheduling */ +.macro pixman_composite_over_n_8888_process_pixblock_tail_head + pixman_composite_over_8888_8888_process_pixblock_tail + vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! + vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! + pixman_composite_over_8888_8888_process_pixblock_head + cache_preload 8, 8 +.endm + +.macro pixman_composite_over_n_8888_init + add DUMMY, sp, #ARGS_STACK_OFFSET + vld1.32 {d3[0]}, [DUMMY] + vdup.8 d0, d3[0] + vdup.8 d1, d3[1] + vdup.8 d2, d3[2] + vdup.8 d3, d3[3] +.endm + +generate_composite_function \ + pixman_composite_over_n_8888_asm_neon, 0, 0, 32, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + pixman_composite_over_n_8888_init, \ + default_cleanup, \ + pixman_composite_over_8888_8888_process_pixblock_head, \ + pixman_composite_over_8888_8888_process_pixblock_tail, \ + pixman_composite_over_n_8888_process_pixblock_tail_head + +/******************************************************************************/ + +.macro pixman_composite_over_reverse_n_8888_process_pixblock_tail_head + vrshr.u16 q14, q8, #8 + PF add PF_X, PF_X, #8 + PF tst PF_CTL, #0xF + vrshr.u16 q15, q9, #8 + vrshr.u16 q12, q10, #8 + vrshr.u16 q13, q11, #8 + PF addne PF_X, PF_X, #8 + PF subne PF_CTL, PF_CTL, #1 + vraddhn.u16 d28, q14, q8 + vraddhn.u16 d29, q15, q9 + PF cmp PF_X, ORIG_W + vraddhn.u16 d30, q12, q10 + vraddhn.u16 d31, q13, q11 + vqadd.u8 q14, q0, q14 + vqadd.u8 q15, q1, q15 + vld4.8 {d0, d1, d2, d3}, [DST_R, :128]! + vmvn.8 d22, d3 + PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] + vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! 
+ PF subge PF_X, PF_X, ORIG_W + vmull.u8 q8, d22, d4 + PF subges PF_CTL, PF_CTL, #0x10 + vmull.u8 q9, d22, d5 + vmull.u8 q10, d22, d6 + PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! + vmull.u8 q11, d22, d7 +.endm + +.macro pixman_composite_over_reverse_n_8888_init + add DUMMY, sp, #ARGS_STACK_OFFSET + vld1.32 {d7[0]}, [DUMMY] + vdup.8 d4, d7[0] + vdup.8 d5, d7[1] + vdup.8 d6, d7[2] + vdup.8 d7, d7[3] +.endm + +generate_composite_function \ + pixman_composite_over_reverse_n_8888_asm_neon, 0, 0, 32, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + pixman_composite_over_reverse_n_8888_init, \ + default_cleanup, \ + pixman_composite_over_8888_8888_process_pixblock_head, \ + pixman_composite_over_8888_8888_process_pixblock_tail, \ + pixman_composite_over_reverse_n_8888_process_pixblock_tail_head, \ + 28, /* dst_w_basereg */ \ + 0, /* dst_r_basereg */ \ + 4, /* src_basereg */ \ + 24 /* mask_basereg */ + +/******************************************************************************/ + +.macro pixman_composite_over_8888_8_0565_process_pixblock_head + vmull.u8 q0, d24, d8 /* IN for SRC pixels (part1) */ + vmull.u8 q1, d24, d9 + vmull.u8 q6, d24, d10 + vmull.u8 q7, d24, d11 + vshrn.u16 d6, q2, #8 /* convert DST_R data to 32-bpp (part1) */ + vshrn.u16 d7, q2, #3 + vsli.u16 q2, q2, #5 + vrshr.u16 q8, q0, #8 /* IN for SRC pixels (part2) */ + vrshr.u16 q9, q1, #8 + vrshr.u16 q10, q6, #8 + vrshr.u16 q11, q7, #8 + vraddhn.u16 d0, q0, q8 + vraddhn.u16 d1, q1, q9 + vraddhn.u16 d2, q6, q10 + vraddhn.u16 d3, q7, q11 + vsri.u8 d6, d6, #5 /* convert DST_R data to 32-bpp (part2) */ + vsri.u8 d7, d7, #6 + vmvn.8 d3, d3 + vshrn.u16 d30, q2, #2 + vmull.u8 q8, d3, d6 /* now do alpha blending */ + vmull.u8 q9, d3, d7 + vmull.u8 q10, d3, d30 +.endm + +.macro pixman_composite_over_8888_8_0565_process_pixblock_tail + /* 3 cycle bubble (after vmull.u8) */ + vrshr.u16 q13, q8, #8 + vrshr.u16 q11, q9, #8 + vrshr.u16 q15, q10, #8 + vraddhn.u16 d16, q8, q13 + vraddhn.u16 d27, q9, q11 + vraddhn.u16 d26, q10, q15 + vqadd.u8 d16, d2, d16 + /* 1 cycle bubble */ + vqadd.u8 q9, q0, q13 + vshll.u8 q14, d16, #8 /* convert to 16bpp */ + vshll.u8 q8, d19, #8 + vshll.u8 q9, d18, #8 + vsri.u16 q14, q8, #5 + /* 1 cycle bubble */ + vsri.u16 q14, q9, #11 +.endm + +.macro pixman_composite_over_8888_8_0565_process_pixblock_tail_head + vld1.16 {d4, d5}, [DST_R, :128]! + vshrn.u16 d6, q2, #8 + fetch_mask_pixblock + vshrn.u16 d7, q2, #3 + fetch_src_pixblock + vmull.u8 q6, d24, d10 + vrshr.u16 q13, q8, #8 + vrshr.u16 q11, q9, #8 + vrshr.u16 q15, q10, #8 + vraddhn.u16 d16, q8, q13 + vraddhn.u16 d27, q9, q11 + vraddhn.u16 d26, q10, q15 + vqadd.u8 d16, d2, d16 + vmull.u8 q1, d24, d9 + vqadd.u8 q9, q0, q13 + vshll.u8 q14, d16, #8 + vmull.u8 q0, d24, d8 + vshll.u8 q8, d19, #8 + vshll.u8 q9, d18, #8 + vsri.u16 q14, q8, #5 + vmull.u8 q7, d24, d11 + vsri.u16 q14, q9, #11 + + cache_preload 8, 8 + + vsli.u16 q2, q2, #5 + vrshr.u16 q8, q0, #8 + vrshr.u16 q9, q1, #8 + vrshr.u16 q10, q6, #8 + vrshr.u16 q11, q7, #8 + vraddhn.u16 d0, q0, q8 + vraddhn.u16 d1, q1, q9 + vraddhn.u16 d2, q6, q10 + vraddhn.u16 d3, q7, q11 + vsri.u8 d6, d6, #5 + vsri.u8 d7, d7, #6 + vmvn.8 d3, d3 + vshrn.u16 d30, q2, #2 + vst1.16 {d28, d29}, [DST_W, :128]! 
+ vmull.u8 q8, d3, d6 + vmull.u8 q9, d3, d7 + vmull.u8 q10, d3, d30 +.endm + +generate_composite_function \ + pixman_composite_over_8888_8_0565_asm_neon, 32, 8, 16, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + default_init_need_all_regs, \ + default_cleanup_need_all_regs, \ + pixman_composite_over_8888_8_0565_process_pixblock_head, \ + pixman_composite_over_8888_8_0565_process_pixblock_tail, \ + pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \ + 28, /* dst_w_basereg */ \ + 4, /* dst_r_basereg */ \ + 8, /* src_basereg */ \ + 24 /* mask_basereg */ + +/******************************************************************************/ + +/* + * This function needs a special initialization of solid mask. + * Solid source pixel data is fetched from stack at ARGS_STACK_OFFSET + * offset, split into color components and replicated in d8-d11 + * registers. Additionally, this function needs all the NEON registers, + * so it has to save d8-d15 registers which are callee saved according + * to ABI. These registers are restored from 'cleanup' macro. All the + * other NEON registers are caller saved, so can be clobbered freely + * without introducing any problems. + */ +.macro pixman_composite_over_n_8_0565_init + add DUMMY, sp, #ARGS_STACK_OFFSET + vpush {d8-d15} + vld1.32 {d11[0]}, [DUMMY] + vdup.8 d8, d11[0] + vdup.8 d9, d11[1] + vdup.8 d10, d11[2] + vdup.8 d11, d11[3] +.endm + +.macro pixman_composite_over_n_8_0565_cleanup + vpop {d8-d15} +.endm + +generate_composite_function \ + pixman_composite_over_n_8_0565_asm_neon, 0, 8, 16, \ + FLAG_DST_READWRITE, \ + 8, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + pixman_composite_over_n_8_0565_init, \ + pixman_composite_over_n_8_0565_cleanup, \ + pixman_composite_over_8888_8_0565_process_pixblock_head, \ + pixman_composite_over_8888_8_0565_process_pixblock_tail, \ + pixman_composite_over_8888_8_0565_process_pixblock_tail_head + +/******************************************************************************/ + +.macro pixman_composite_over_8888_n_0565_init + add DUMMY, sp, #(ARGS_STACK_OFFSET + 8) + vpush {d8-d15} + vld1.32 {d24[0]}, [DUMMY] + vdup.8 d24, d24[3] +.endm + +.macro pixman_composite_over_8888_n_0565_cleanup + vpop {d8-d15} +.endm + +generate_composite_function \ + pixman_composite_over_8888_n_0565_asm_neon, 32, 0, 16, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + pixman_composite_over_8888_n_0565_init, \ + pixman_composite_over_8888_n_0565_cleanup, \ + pixman_composite_over_8888_8_0565_process_pixblock_head, \ + pixman_composite_over_8888_8_0565_process_pixblock_tail, \ + pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \ + 28, /* dst_w_basereg */ \ + 4, /* dst_r_basereg */ \ + 8, /* src_basereg */ \ + 24 /* mask_basereg */ + +/******************************************************************************/ + +.macro pixman_composite_src_0565_0565_process_pixblock_head +.endm + +.macro pixman_composite_src_0565_0565_process_pixblock_tail +.endm + +.macro pixman_composite_src_0565_0565_process_pixblock_tail_head + vst1.16 {d0, d1, d2, d3}, [DST_W, :128]! 
+ fetch_src_pixblock + cache_preload 16, 16 +.endm + +generate_composite_function \ + pixman_composite_src_0565_0565_asm_neon, 16, 0, 16, \ + FLAG_DST_WRITEONLY, \ + 16, /* number of pixels, processed in a single block */ \ + 10, /* prefetch distance */ \ + default_init, \ + default_cleanup, \ + pixman_composite_src_0565_0565_process_pixblock_head, \ + pixman_composite_src_0565_0565_process_pixblock_tail, \ + pixman_composite_src_0565_0565_process_pixblock_tail_head, \ + 0, /* dst_w_basereg */ \ + 0, /* dst_r_basereg */ \ + 0, /* src_basereg */ \ + 0 /* mask_basereg */ + +/******************************************************************************/ + +.macro pixman_composite_src_n_8_process_pixblock_head +.endm + +.macro pixman_composite_src_n_8_process_pixblock_tail +.endm + +.macro pixman_composite_src_n_8_process_pixblock_tail_head + vst1.8 {d0, d1, d2, d3}, [DST_W, :128]! +.endm + +.macro pixman_composite_src_n_8_init + add DUMMY, sp, #ARGS_STACK_OFFSET + vld1.32 {d0[0]}, [DUMMY] + vsli.u64 d0, d0, #8 + vsli.u64 d0, d0, #16 + vsli.u64 d0, d0, #32 + vorr d1, d0, d0 + vorr q1, q0, q0 +.endm + +.macro pixman_composite_src_n_8_cleanup +.endm + +generate_composite_function \ + pixman_composite_src_n_8_asm_neon, 0, 0, 8, \ + FLAG_DST_WRITEONLY, \ + 32, /* number of pixels, processed in a single block */ \ + 0, /* prefetch distance */ \ + pixman_composite_src_n_8_init, \ + pixman_composite_src_n_8_cleanup, \ + pixman_composite_src_n_8_process_pixblock_head, \ + pixman_composite_src_n_8_process_pixblock_tail, \ + pixman_composite_src_n_8_process_pixblock_tail_head, \ + 0, /* dst_w_basereg */ \ + 0, /* dst_r_basereg */ \ + 0, /* src_basereg */ \ + 0 /* mask_basereg */ + +/******************************************************************************/ + +.macro pixman_composite_src_n_0565_process_pixblock_head +.endm + +.macro pixman_composite_src_n_0565_process_pixblock_tail +.endm + +.macro pixman_composite_src_n_0565_process_pixblock_tail_head + vst1.16 {d0, d1, d2, d3}, [DST_W, :128]! +.endm + +.macro pixman_composite_src_n_0565_init + add DUMMY, sp, #ARGS_STACK_OFFSET + vld1.32 {d0[0]}, [DUMMY] + vsli.u64 d0, d0, #16 + vsli.u64 d0, d0, #32 + vorr d1, d0, d0 + vorr q1, q0, q0 +.endm + +.macro pixman_composite_src_n_0565_cleanup +.endm + +generate_composite_function \ + pixman_composite_src_n_0565_asm_neon, 0, 0, 16, \ + FLAG_DST_WRITEONLY, \ + 16, /* number of pixels, processed in a single block */ \ + 0, /* prefetch distance */ \ + pixman_composite_src_n_0565_init, \ + pixman_composite_src_n_0565_cleanup, \ + pixman_composite_src_n_0565_process_pixblock_head, \ + pixman_composite_src_n_0565_process_pixblock_tail, \ + pixman_composite_src_n_0565_process_pixblock_tail_head, \ + 0, /* dst_w_basereg */ \ + 0, /* dst_r_basereg */ \ + 0, /* src_basereg */ \ + 0 /* mask_basereg */ + +/******************************************************************************/ + +.macro pixman_composite_src_n_8888_process_pixblock_head +.endm + +.macro pixman_composite_src_n_8888_process_pixblock_tail +.endm + +.macro pixman_composite_src_n_8888_process_pixblock_tail_head + vst1.32 {d0, d1, d2, d3}, [DST_W, :128]! 
+.endm + +.macro pixman_composite_src_n_8888_init + add DUMMY, sp, #ARGS_STACK_OFFSET + vld1.32 {d0[0]}, [DUMMY] + vsli.u64 d0, d0, #32 + vorr d1, d0, d0 + vorr q1, q0, q0 +.endm + +.macro pixman_composite_src_n_8888_cleanup +.endm + +generate_composite_function \ + pixman_composite_src_n_8888_asm_neon, 0, 0, 32, \ + FLAG_DST_WRITEONLY, \ + 8, /* number of pixels, processed in a single block */ \ + 0, /* prefetch distance */ \ + pixman_composite_src_n_8888_init, \ + pixman_composite_src_n_8888_cleanup, \ + pixman_composite_src_n_8888_process_pixblock_head, \ + pixman_composite_src_n_8888_process_pixblock_tail, \ + pixman_composite_src_n_8888_process_pixblock_tail_head, \ + 0, /* dst_w_basereg */ \ + 0, /* dst_r_basereg */ \ + 0, /* src_basereg */ \ + 0 /* mask_basereg */ + +/******************************************************************************/ + +.macro pixman_composite_src_8888_8888_process_pixblock_head +.endm + +.macro pixman_composite_src_8888_8888_process_pixblock_tail +.endm + +.macro pixman_composite_src_8888_8888_process_pixblock_tail_head + vst1.32 {d0, d1, d2, d3}, [DST_W, :128]! + fetch_src_pixblock + cache_preload 8, 8 +.endm + +generate_composite_function \ + pixman_composite_src_8888_8888_asm_neon, 32, 0, 32, \ + FLAG_DST_WRITEONLY, \ + 8, /* number of pixels, processed in a single block */ \ + 10, /* prefetch distance */ \ + default_init, \ + default_cleanup, \ + pixman_composite_src_8888_8888_process_pixblock_head, \ + pixman_composite_src_8888_8888_process_pixblock_tail, \ + pixman_composite_src_8888_8888_process_pixblock_tail_head, \ + 0, /* dst_w_basereg */ \ + 0, /* dst_r_basereg */ \ + 0, /* src_basereg */ \ + 0 /* mask_basereg */ + +/******************************************************************************/ + +.macro pixman_composite_src_x888_8888_process_pixblock_head + vorr q0, q0, q2 + vorr q1, q1, q2 +.endm + +.macro pixman_composite_src_x888_8888_process_pixblock_tail +.endm + +.macro pixman_composite_src_x888_8888_process_pixblock_tail_head + vst1.32 {d0, d1, d2, d3}, [DST_W, :128]! 
+ fetch_src_pixblock + vorr q0, q0, q2 + vorr q1, q1, q2 + cache_preload 8, 8 +.endm + +.macro pixman_composite_src_x888_8888_init + vmov.u8 q2, #0xFF + vshl.u32 q2, q2, #24 +.endm + +generate_composite_function \ + pixman_composite_src_x888_8888_asm_neon, 32, 0, 32, \ + FLAG_DST_WRITEONLY, \ + 8, /* number of pixels, processed in a single block */ \ + 10, /* prefetch distance */ \ + pixman_composite_src_x888_8888_init, \ + default_cleanup, \ + pixman_composite_src_x888_8888_process_pixblock_head, \ + pixman_composite_src_x888_8888_process_pixblock_tail, \ + pixman_composite_src_x888_8888_process_pixblock_tail_head, \ + 0, /* dst_w_basereg */ \ + 0, /* dst_r_basereg */ \ + 0, /* src_basereg */ \ + 0 /* mask_basereg */ + +/******************************************************************************/ + +.macro pixman_composite_over_n_8_8888_process_pixblock_head + /* expecting deinterleaved source data in {d8, d9, d10, d11} */ + /* d8 - blue, d9 - green, d10 - red, d11 - alpha */ + /* and destination data in {d4, d5, d6, d7} */ + /* mask is in d24 (d25, d26, d27 are unused) */ + + /* in */ + vmull.u8 q0, d24, d8 + vmull.u8 q1, d24, d9 + vmull.u8 q6, d24, d10 + vmull.u8 q7, d24, d11 + vrshr.u16 q10, q0, #8 + vrshr.u16 q11, q1, #8 + vrshr.u16 q12, q6, #8 + vrshr.u16 q13, q7, #8 + vraddhn.u16 d0, q0, q10 + vraddhn.u16 d1, q1, q11 + vraddhn.u16 d2, q6, q12 + vraddhn.u16 d3, q7, q13 + vmvn.8 d24, d3 /* get inverted alpha */ + /* source: d0 - blue, d1 - green, d2 - red, d3 - alpha */ + /* destination: d4 - blue, d5 - green, d6 - red, d7 - alpha */ + /* now do alpha blending */ + vmull.u8 q8, d24, d4 + vmull.u8 q9, d24, d5 + vmull.u8 q10, d24, d6 + vmull.u8 q11, d24, d7 +.endm + +.macro pixman_composite_over_n_8_8888_process_pixblock_tail + vrshr.u16 q14, q8, #8 + vrshr.u16 q15, q9, #8 + vrshr.u16 q12, q10, #8 + vrshr.u16 q13, q11, #8 + vraddhn.u16 d28, q14, q8 + vraddhn.u16 d29, q15, q9 + vraddhn.u16 d30, q12, q10 + vraddhn.u16 d31, q13, q11 + vqadd.u8 q14, q0, q14 + vqadd.u8 q15, q1, q15 +.endm + +/* TODO: expand macros and do better instructions scheduling */ +.macro pixman_composite_over_n_8_8888_process_pixblock_tail_head + pixman_composite_over_n_8_8888_process_pixblock_tail + vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! + vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! 
+ fetch_mask_pixblock + cache_preload 8, 8 + pixman_composite_over_n_8_8888_process_pixblock_head +.endm + +.macro pixman_composite_over_n_8_8888_init + add DUMMY, sp, #ARGS_STACK_OFFSET + vpush {d8-d15} + vld1.32 {d11[0]}, [DUMMY] + vdup.8 d8, d11[0] + vdup.8 d9, d11[1] + vdup.8 d10, d11[2] + vdup.8 d11, d11[3] +.endm + +.macro pixman_composite_over_n_8_8888_cleanup + vpop {d8-d15} +.endm + +generate_composite_function \ + pixman_composite_over_n_8_8888_asm_neon, 0, 8, 32, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + pixman_composite_over_n_8_8888_init, \ + pixman_composite_over_n_8_8888_cleanup, \ + pixman_composite_over_n_8_8888_process_pixblock_head, \ + pixman_composite_over_n_8_8888_process_pixblock_tail, \ + pixman_composite_over_n_8_8888_process_pixblock_tail_head + +/******************************************************************************/ + +.macro pixman_composite_over_n_8_8_process_pixblock_head + vmull.u8 q0, d24, d8 + vmull.u8 q1, d25, d8 + vmull.u8 q6, d26, d8 + vmull.u8 q7, d27, d8 + vrshr.u16 q10, q0, #8 + vrshr.u16 q11, q1, #8 + vrshr.u16 q12, q6, #8 + vrshr.u16 q13, q7, #8 + vraddhn.u16 d0, q0, q10 + vraddhn.u16 d1, q1, q11 + vraddhn.u16 d2, q6, q12 + vraddhn.u16 d3, q7, q13 + vmvn.8 q12, q0 + vmvn.8 q13, q1 + vmull.u8 q8, d24, d4 + vmull.u8 q9, d25, d5 + vmull.u8 q10, d26, d6 + vmull.u8 q11, d27, d7 +.endm + +.macro pixman_composite_over_n_8_8_process_pixblock_tail + vrshr.u16 q14, q8, #8 + vrshr.u16 q15, q9, #8 + vrshr.u16 q12, q10, #8 + vrshr.u16 q13, q11, #8 + vraddhn.u16 d28, q14, q8 + vraddhn.u16 d29, q15, q9 + vraddhn.u16 d30, q12, q10 + vraddhn.u16 d31, q13, q11 + vqadd.u8 q14, q0, q14 + vqadd.u8 q15, q1, q15 +.endm + +/* TODO: expand macros and do better instructions scheduling */ +.macro pixman_composite_over_n_8_8_process_pixblock_tail_head + vld1.8 {d4, d5, d6, d7}, [DST_R, :128]! + pixman_composite_over_n_8_8_process_pixblock_tail + fetch_mask_pixblock + cache_preload 32, 32 + vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! 
+ pixman_composite_over_n_8_8_process_pixblock_head +.endm + +.macro pixman_composite_over_n_8_8_init + add DUMMY, sp, #ARGS_STACK_OFFSET + vpush {d8-d15} + vld1.32 {d8[0]}, [DUMMY] + vdup.8 d8, d8[3] +.endm + +.macro pixman_composite_over_n_8_8_cleanup + vpop {d8-d15} +.endm + +generate_composite_function \ + pixman_composite_over_n_8_8_asm_neon, 0, 8, 8, \ + FLAG_DST_READWRITE, \ + 32, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + pixman_composite_over_n_8_8_init, \ + pixman_composite_over_n_8_8_cleanup, \ + pixman_composite_over_n_8_8_process_pixblock_head, \ + pixman_composite_over_n_8_8_process_pixblock_tail, \ + pixman_composite_over_n_8_8_process_pixblock_tail_head + +/******************************************************************************/ + +.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_head + /* + * 'combine_mask_ca' replacement + * + * input: solid src (n) in {d8, d9, d10, d11} + * dest in {d4, d5, d6, d7 } + * mask in {d24, d25, d26, d27} + * output: updated src in {d0, d1, d2, d3 } + * updated mask in {d24, d25, d26, d3 } + */ + vmull.u8 q0, d24, d8 + vmull.u8 q1, d25, d9 + vmull.u8 q6, d26, d10 + vmull.u8 q7, d27, d11 + vmull.u8 q9, d11, d25 + vmull.u8 q12, d11, d24 + vmull.u8 q13, d11, d26 + vrshr.u16 q8, q0, #8 + vrshr.u16 q10, q1, #8 + vrshr.u16 q11, q6, #8 + vraddhn.u16 d0, q0, q8 + vraddhn.u16 d1, q1, q10 + vraddhn.u16 d2, q6, q11 + vrshr.u16 q11, q12, #8 + vrshr.u16 q8, q9, #8 + vrshr.u16 q6, q13, #8 + vrshr.u16 q10, q7, #8 + vraddhn.u16 d24, q12, q11 + vraddhn.u16 d25, q9, q8 + vraddhn.u16 d26, q13, q6 + vraddhn.u16 d3, q7, q10 + /* + * 'combine_over_ca' replacement + * + * output: updated dest in {d28, d29, d30, d31} + */ + vmvn.8 d24, d24 + vmvn.8 d25, d25 + vmull.u8 q8, d24, d4 + vmull.u8 q9, d25, d5 + vmvn.8 d26, d26 + vmvn.8 d27, d3 + vmull.u8 q10, d26, d6 + vmull.u8 q11, d27, d7 +.endm + +.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail + /* ... continue 'combine_over_ca' replacement */ + vrshr.u16 q14, q8, #8 + vrshr.u16 q15, q9, #8 + vrshr.u16 q6, q10, #8 + vrshr.u16 q7, q11, #8 + vraddhn.u16 d28, q14, q8 + vraddhn.u16 d29, q15, q9 + vraddhn.u16 d30, q6, q10 + vraddhn.u16 d31, q7, q11 + vqadd.u8 q14, q0, q14 + vqadd.u8 q15, q1, q15 +.endm + +.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head + vrshr.u16 q14, q8, #8 + vrshr.u16 q15, q9, #8 + vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! + vrshr.u16 q6, q10, #8 + vrshr.u16 q7, q11, #8 + vraddhn.u16 d28, q14, q8 + vraddhn.u16 d29, q15, q9 + vraddhn.u16 d30, q6, q10 + vraddhn.u16 d31, q7, q11 + fetch_mask_pixblock + vqadd.u8 q14, q0, q14 + vqadd.u8 q15, q1, q15 + cache_preload 8, 8 + pixman_composite_over_n_8888_8888_ca_process_pixblock_head + vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! 
+.endm + +.macro pixman_composite_over_n_8888_8888_ca_init + add DUMMY, sp, #ARGS_STACK_OFFSET + vpush {d8-d15} + vld1.32 {d11[0]}, [DUMMY] + vdup.8 d8, d11[0] + vdup.8 d9, d11[1] + vdup.8 d10, d11[2] + vdup.8 d11, d11[3] +.endm + +.macro pixman_composite_over_n_8888_8888_ca_cleanup + vpop {d8-d15} +.endm + +generate_composite_function \ + pixman_composite_over_n_8888_8888_ca_asm_neon, 0, 32, 32, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + pixman_composite_over_n_8888_8888_ca_init, \ + pixman_composite_over_n_8888_8888_ca_cleanup, \ + pixman_composite_over_n_8888_8888_ca_process_pixblock_head, \ + pixman_composite_over_n_8888_8888_ca_process_pixblock_tail, \ + pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head + +/******************************************************************************/ + +.macro pixman_composite_in_n_8_process_pixblock_head + /* expecting source data in {d0, d1, d2, d3} */ + /* and destination data in {d4, d5, d6, d7} */ + vmull.u8 q8, d4, d3 + vmull.u8 q9, d5, d3 + vmull.u8 q10, d6, d3 + vmull.u8 q11, d7, d3 +.endm + +.macro pixman_composite_in_n_8_process_pixblock_tail + vrshr.u16 q14, q8, #8 + vrshr.u16 q15, q9, #8 + vrshr.u16 q12, q10, #8 + vrshr.u16 q13, q11, #8 + vraddhn.u16 d28, q8, q14 + vraddhn.u16 d29, q9, q15 + vraddhn.u16 d30, q10, q12 + vraddhn.u16 d31, q11, q13 +.endm + +.macro pixman_composite_in_n_8_process_pixblock_tail_head + pixman_composite_in_n_8_process_pixblock_tail + vld1.8 {d4, d5, d6, d7}, [DST_R, :128]! + cache_preload 32, 32 + pixman_composite_in_n_8_process_pixblock_head + vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! +.endm + +.macro pixman_composite_in_n_8_init + add DUMMY, sp, #ARGS_STACK_OFFSET + vld1.32 {d3[0]}, [DUMMY] + vdup.8 d3, d3[3] +.endm + +.macro pixman_composite_in_n_8_cleanup +.endm + +generate_composite_function \ + pixman_composite_in_n_8_asm_neon, 0, 0, 8, \ + FLAG_DST_READWRITE, \ + 32, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + pixman_composite_in_n_8_init, \ + pixman_composite_in_n_8_cleanup, \ + pixman_composite_in_n_8_process_pixblock_head, \ + pixman_composite_in_n_8_process_pixblock_tail, \ + pixman_composite_in_n_8_process_pixblock_tail_head, \ + 28, /* dst_w_basereg */ \ + 4, /* dst_r_basereg */ \ + 0, /* src_basereg */ \ + 24 /* mask_basereg */ + +.macro pixman_composite_add_n_8_8_process_pixblock_head + /* expecting source data in {d8, d9, d10, d11} */ + /* d8 - blue, d9 - green, d10 - red, d11 - alpha */ + /* and destination data in {d4, d5, d6, d7} */ + /* mask is in d24, d25, d26, d27 */ + vmull.u8 q0, d24, d11 + vmull.u8 q1, d25, d11 + vmull.u8 q6, d26, d11 + vmull.u8 q7, d27, d11 + vrshr.u16 q10, q0, #8 + vrshr.u16 q11, q1, #8 + vrshr.u16 q12, q6, #8 + vrshr.u16 q13, q7, #8 + vraddhn.u16 d0, q0, q10 + vraddhn.u16 d1, q1, q11 + vraddhn.u16 d2, q6, q12 + vraddhn.u16 d3, q7, q13 + vqadd.u8 q14, q0, q2 + vqadd.u8 q15, q1, q3 +.endm + +.macro pixman_composite_add_n_8_8_process_pixblock_tail +.endm + +/* TODO: expand macros and do better instructions scheduling */ +.macro pixman_composite_add_n_8_8_process_pixblock_tail_head + pixman_composite_add_n_8_8_process_pixblock_tail + vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! + vld1.8 {d4, d5, d6, d7}, [DST_R, :128]! 
+ fetch_mask_pixblock + cache_preload 32, 32 + pixman_composite_add_n_8_8_process_pixblock_head +.endm + +.macro pixman_composite_add_n_8_8_init + add DUMMY, sp, #ARGS_STACK_OFFSET + vpush {d8-d15} + vld1.32 {d11[0]}, [DUMMY] + vdup.8 d11, d11[3] +.endm + +.macro pixman_composite_add_n_8_8_cleanup + vpop {d8-d15} +.endm + +generate_composite_function \ + pixman_composite_add_n_8_8_asm_neon, 0, 8, 8, \ + FLAG_DST_READWRITE, \ + 32, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + pixman_composite_add_n_8_8_init, \ + pixman_composite_add_n_8_8_cleanup, \ + pixman_composite_add_n_8_8_process_pixblock_head, \ + pixman_composite_add_n_8_8_process_pixblock_tail, \ + pixman_composite_add_n_8_8_process_pixblock_tail_head + +/******************************************************************************/ + +.macro pixman_composite_add_8_8_8_process_pixblock_head + /* expecting source data in {d0, d1, d2, d3} */ + /* destination data in {d4, d5, d6, d7} */ + /* mask in {d24, d25, d26, d27} */ + vmull.u8 q8, d24, d0 + vmull.u8 q9, d25, d1 + vmull.u8 q10, d26, d2 + vmull.u8 q11, d27, d3 + vrshr.u16 q0, q8, #8 + vrshr.u16 q1, q9, #8 + vrshr.u16 q12, q10, #8 + vrshr.u16 q13, q11, #8 + vraddhn.u16 d0, q0, q8 + vraddhn.u16 d1, q1, q9 + vraddhn.u16 d2, q12, q10 + vraddhn.u16 d3, q13, q11 + vqadd.u8 q14, q0, q2 + vqadd.u8 q15, q1, q3 +.endm + +.macro pixman_composite_add_8_8_8_process_pixblock_tail +.endm + +/* TODO: expand macros and do better instructions scheduling */ +.macro pixman_composite_add_8_8_8_process_pixblock_tail_head + pixman_composite_add_8_8_8_process_pixblock_tail + vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! + vld1.8 {d4, d5, d6, d7}, [DST_R, :128]! + fetch_mask_pixblock + fetch_src_pixblock + cache_preload 32, 32 + pixman_composite_add_8_8_8_process_pixblock_head +.endm + +.macro pixman_composite_add_8_8_8_init +.endm + +.macro pixman_composite_add_8_8_8_cleanup +.endm + +generate_composite_function \ + pixman_composite_add_8_8_8_asm_neon, 8, 8, 8, \ + FLAG_DST_READWRITE, \ + 32, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + pixman_composite_add_8_8_8_init, \ + pixman_composite_add_8_8_8_cleanup, \ + pixman_composite_add_8_8_8_process_pixblock_head, \ + pixman_composite_add_8_8_8_process_pixblock_tail, \ + pixman_composite_add_8_8_8_process_pixblock_tail_head + +/******************************************************************************/ + +.macro pixman_composite_add_8888_8888_8888_process_pixblock_head + /* expecting source data in {d0, d1, d2, d3} */ + /* destination data in {d4, d5, d6, d7} */ + /* mask in {d24, d25, d26, d27} */ + vmull.u8 q8, d27, d0 + vmull.u8 q9, d27, d1 + vmull.u8 q10, d27, d2 + vmull.u8 q11, d27, d3 + /* 1 cycle bubble */ + vrsra.u16 q8, q8, #8 + vrsra.u16 q9, q9, #8 + vrsra.u16 q10, q10, #8 + vrsra.u16 q11, q11, #8 +.endm + +.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail + /* 2 cycle bubble */ + vrshrn.u16 d28, q8, #8 + vrshrn.u16 d29, q9, #8 + vrshrn.u16 d30, q10, #8 + vrshrn.u16 d31, q11, #8 + vqadd.u8 q14, q2, q14 + /* 1 cycle bubble */ + vqadd.u8 q15, q3, q15 +.endm + +.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail_head + fetch_src_pixblock + vrshrn.u16 d28, q8, #8 + fetch_mask_pixblock + vrshrn.u16 d29, q9, #8 + vmull.u8 q8, d27, d0 + vrshrn.u16 d30, q10, #8 + vmull.u8 q9, d27, d1 + vrshrn.u16 d31, q11, #8 + vmull.u8 q10, d27, d2 + vqadd.u8 q14, q2, q14 + vmull.u8 q11, d27, d3 + vqadd.u8 q15, q3, q15 + vrsra.u16 q8, q8, #8 + vld4.8 {d4, 
d5, d6, d7}, [DST_R, :128]! + vrsra.u16 q9, q9, #8 + vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! + vrsra.u16 q10, q10, #8 + + cache_preload 8, 8 + + vrsra.u16 q11, q11, #8 +.endm + +generate_composite_function \ + pixman_composite_add_8888_8888_8888_asm_neon, 32, 32, 32, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + 10, /* prefetch distance */ \ + default_init, \ + default_cleanup, \ + pixman_composite_add_8888_8888_8888_process_pixblock_head, \ + pixman_composite_add_8888_8888_8888_process_pixblock_tail, \ + pixman_composite_add_8888_8888_8888_process_pixblock_tail_head + +generate_composite_function_single_scanline \ + pixman_composite_scanline_add_mask_asm_neon, 32, 32, 32, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + default_init, \ + default_cleanup, \ + pixman_composite_add_8888_8888_8888_process_pixblock_head, \ + pixman_composite_add_8888_8888_8888_process_pixblock_tail, \ + pixman_composite_add_8888_8888_8888_process_pixblock_tail_head + +/******************************************************************************/ + +generate_composite_function \ + pixman_composite_add_8888_8_8888_asm_neon, 32, 8, 32, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + default_init, \ + default_cleanup, \ + pixman_composite_add_8888_8888_8888_process_pixblock_head, \ + pixman_composite_add_8888_8888_8888_process_pixblock_tail, \ + pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \ + 28, /* dst_w_basereg */ \ + 4, /* dst_r_basereg */ \ + 0, /* src_basereg */ \ + 27 /* mask_basereg */ + +/******************************************************************************/ + +.macro pixman_composite_add_n_8_8888_init + add DUMMY, sp, #ARGS_STACK_OFFSET + vld1.32 {d3[0]}, [DUMMY] + vdup.8 d0, d3[0] + vdup.8 d1, d3[1] + vdup.8 d2, d3[2] + vdup.8 d3, d3[3] +.endm + +.macro pixman_composite_add_n_8_8888_cleanup +.endm + +generate_composite_function \ + pixman_composite_add_n_8_8888_asm_neon, 0, 8, 32, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + pixman_composite_add_n_8_8888_init, \ + pixman_composite_add_n_8_8888_cleanup, \ + pixman_composite_add_8888_8888_8888_process_pixblock_head, \ + pixman_composite_add_8888_8888_8888_process_pixblock_tail, \ + pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \ + 28, /* dst_w_basereg */ \ + 4, /* dst_r_basereg */ \ + 0, /* src_basereg */ \ + 27 /* mask_basereg */ + +/******************************************************************************/ + +.macro pixman_composite_add_8888_n_8888_init + add DUMMY, sp, #(ARGS_STACK_OFFSET + 8) + vld1.32 {d27[0]}, [DUMMY] + vdup.8 d27, d27[3] +.endm + +.macro pixman_composite_add_8888_n_8888_cleanup +.endm + +generate_composite_function \ + pixman_composite_add_8888_n_8888_asm_neon, 32, 0, 32, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + pixman_composite_add_8888_n_8888_init, \ + pixman_composite_add_8888_n_8888_cleanup, \ + pixman_composite_add_8888_8888_8888_process_pixblock_head, \ + pixman_composite_add_8888_8888_8888_process_pixblock_tail, \ + pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \ + 28, /* dst_w_basereg */ \ + 4, /* dst_r_basereg */ \ + 0, /* 
src_basereg */ \ + 27 /* mask_basereg */ + +/******************************************************************************/ + +.macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_head + /* expecting source data in {d0, d1, d2, d3} */ + /* destination data in {d4, d5, d6, d7} */ + /* solid mask is in d15 */ + + /* 'in' */ + vmull.u8 q8, d15, d3 + vmull.u8 q6, d15, d2 + vmull.u8 q5, d15, d1 + vmull.u8 q4, d15, d0 + vrshr.u16 q13, q8, #8 + vrshr.u16 q12, q6, #8 + vrshr.u16 q11, q5, #8 + vrshr.u16 q10, q4, #8 + vraddhn.u16 d3, q8, q13 + vraddhn.u16 d2, q6, q12 + vraddhn.u16 d1, q5, q11 + vraddhn.u16 d0, q4, q10 + vmvn.8 d24, d3 /* get inverted alpha */ + /* now do alpha blending */ + vmull.u8 q8, d24, d4 + vmull.u8 q9, d24, d5 + vmull.u8 q10, d24, d6 + vmull.u8 q11, d24, d7 +.endm + +.macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail + vrshr.u16 q14, q8, #8 + vrshr.u16 q15, q9, #8 + vrshr.u16 q12, q10, #8 + vrshr.u16 q13, q11, #8 + vraddhn.u16 d28, q14, q8 + vraddhn.u16 d29, q15, q9 + vraddhn.u16 d30, q12, q10 + vraddhn.u16 d31, q13, q11 +.endm + +/* TODO: expand macros and do better instructions scheduling */ +.macro pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head + vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! + pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail + fetch_src_pixblock + cache_preload 8, 8 + fetch_mask_pixblock + pixman_composite_out_reverse_8888_n_8888_process_pixblock_head + vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! +.endm + +generate_composite_function_single_scanline \ + pixman_composite_scanline_out_reverse_mask_asm_neon, 32, 32, 32, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + default_init_need_all_regs, \ + default_cleanup_need_all_regs, \ + pixman_composite_out_reverse_8888_n_8888_process_pixblock_head, \ + pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail, \ + pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head \ + 28, /* dst_w_basereg */ \ + 4, /* dst_r_basereg */ \ + 0, /* src_basereg */ \ + 12 /* mask_basereg */ + +/******************************************************************************/ + +.macro pixman_composite_over_8888_n_8888_process_pixblock_head + pixman_composite_out_reverse_8888_n_8888_process_pixblock_head +.endm + +.macro pixman_composite_over_8888_n_8888_process_pixblock_tail + pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail + vqadd.u8 q14, q0, q14 + vqadd.u8 q15, q1, q15 +.endm + +/* TODO: expand macros and do better instructions scheduling */ +.macro pixman_composite_over_8888_n_8888_process_pixblock_tail_head + vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! + pixman_composite_over_8888_n_8888_process_pixblock_tail + fetch_src_pixblock + cache_preload 8, 8 + pixman_composite_over_8888_n_8888_process_pixblock_head + vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! 
+.endm + +.macro pixman_composite_over_8888_n_8888_init + add DUMMY, sp, #48 + vpush {d8-d15} + vld1.32 {d15[0]}, [DUMMY] + vdup.8 d15, d15[3] +.endm + +.macro pixman_composite_over_8888_n_8888_cleanup + vpop {d8-d15} +.endm + +generate_composite_function \ + pixman_composite_over_8888_n_8888_asm_neon, 32, 0, 32, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + pixman_composite_over_8888_n_8888_init, \ + pixman_composite_over_8888_n_8888_cleanup, \ + pixman_composite_over_8888_n_8888_process_pixblock_head, \ + pixman_composite_over_8888_n_8888_process_pixblock_tail, \ + pixman_composite_over_8888_n_8888_process_pixblock_tail_head + +/******************************************************************************/ + +/* TODO: expand macros and do better instructions scheduling */ +.macro pixman_composite_over_8888_8888_8888_process_pixblock_tail_head + vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! + pixman_composite_over_8888_n_8888_process_pixblock_tail + fetch_src_pixblock + cache_preload 8, 8 + fetch_mask_pixblock + pixman_composite_over_8888_n_8888_process_pixblock_head + vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! +.endm + +generate_composite_function \ + pixman_composite_over_8888_8888_8888_asm_neon, 32, 32, 32, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + default_init_need_all_regs, \ + default_cleanup_need_all_regs, \ + pixman_composite_over_8888_n_8888_process_pixblock_head, \ + pixman_composite_over_8888_n_8888_process_pixblock_tail, \ + pixman_composite_over_8888_8888_8888_process_pixblock_tail_head \ + 28, /* dst_w_basereg */ \ + 4, /* dst_r_basereg */ \ + 0, /* src_basereg */ \ + 12 /* mask_basereg */ + +generate_composite_function_single_scanline \ + pixman_composite_scanline_over_mask_asm_neon, 32, 32, 32, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + default_init_need_all_regs, \ + default_cleanup_need_all_regs, \ + pixman_composite_over_8888_n_8888_process_pixblock_head, \ + pixman_composite_over_8888_n_8888_process_pixblock_tail, \ + pixman_composite_over_8888_8888_8888_process_pixblock_tail_head \ + 28, /* dst_w_basereg */ \ + 4, /* dst_r_basereg */ \ + 0, /* src_basereg */ \ + 12 /* mask_basereg */ + +/******************************************************************************/ + +/* TODO: expand macros and do better instructions scheduling */ +.macro pixman_composite_over_8888_8_8888_process_pixblock_tail_head + vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! + pixman_composite_over_8888_n_8888_process_pixblock_tail + fetch_src_pixblock + cache_preload 8, 8 + fetch_mask_pixblock + pixman_composite_over_8888_n_8888_process_pixblock_head + vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! 
+.endm + +generate_composite_function \ + pixman_composite_over_8888_8_8888_asm_neon, 32, 8, 32, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + default_init_need_all_regs, \ + default_cleanup_need_all_regs, \ + pixman_composite_over_8888_n_8888_process_pixblock_head, \ + pixman_composite_over_8888_n_8888_process_pixblock_tail, \ + pixman_composite_over_8888_8_8888_process_pixblock_tail_head \ + 28, /* dst_w_basereg */ \ + 4, /* dst_r_basereg */ \ + 0, /* src_basereg */ \ + 15 /* mask_basereg */ + +/******************************************************************************/ + +.macro pixman_composite_src_0888_0888_process_pixblock_head +.endm + +.macro pixman_composite_src_0888_0888_process_pixblock_tail +.endm + +.macro pixman_composite_src_0888_0888_process_pixblock_tail_head + vst3.8 {d0, d1, d2}, [DST_W]! + fetch_src_pixblock + cache_preload 8, 8 +.endm + +generate_composite_function \ + pixman_composite_src_0888_0888_asm_neon, 24, 0, 24, \ + FLAG_DST_WRITEONLY, \ + 8, /* number of pixels, processed in a single block */ \ + 10, /* prefetch distance */ \ + default_init, \ + default_cleanup, \ + pixman_composite_src_0888_0888_process_pixblock_head, \ + pixman_composite_src_0888_0888_process_pixblock_tail, \ + pixman_composite_src_0888_0888_process_pixblock_tail_head, \ + 0, /* dst_w_basereg */ \ + 0, /* dst_r_basereg */ \ + 0, /* src_basereg */ \ + 0 /* mask_basereg */ + +/******************************************************************************/ + +.macro pixman_composite_src_0888_8888_rev_process_pixblock_head + vswp d0, d2 +.endm + +.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail +.endm + +.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail_head + vst4.8 {d0, d1, d2, d3}, [DST_W]! + fetch_src_pixblock + vswp d0, d2 + cache_preload 8, 8 +.endm + +.macro pixman_composite_src_0888_8888_rev_init + veor d3, d3, d3 +.endm + +generate_composite_function \ + pixman_composite_src_0888_8888_rev_asm_neon, 24, 0, 32, \ + FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + 10, /* prefetch distance */ \ + pixman_composite_src_0888_8888_rev_init, \ + default_cleanup, \ + pixman_composite_src_0888_8888_rev_process_pixblock_head, \ + pixman_composite_src_0888_8888_rev_process_pixblock_tail, \ + pixman_composite_src_0888_8888_rev_process_pixblock_tail_head, \ + 0, /* dst_w_basereg */ \ + 0, /* dst_r_basereg */ \ + 0, /* src_basereg */ \ + 0 /* mask_basereg */ + +/******************************************************************************/ + +.macro pixman_composite_src_0888_0565_rev_process_pixblock_head + vshll.u8 q8, d1, #8 + vshll.u8 q9, d2, #8 +.endm + +.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail + vshll.u8 q14, d0, #8 + vsri.u16 q14, q8, #5 + vsri.u16 q14, q9, #11 +.endm + +.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail_head + vshll.u8 q14, d0, #8 + fetch_src_pixblock + vsri.u16 q14, q8, #5 + vsri.u16 q14, q9, #11 + vshll.u8 q8, d1, #8 + vst1.16 {d28, d29}, [DST_W, :128]! 
+ vshll.u8 q9, d2, #8 +.endm + +generate_composite_function \ + pixman_composite_src_0888_0565_rev_asm_neon, 24, 0, 16, \ + FLAG_DST_WRITEONLY, \ + 8, /* number of pixels, processed in a single block */ \ + 10, /* prefetch distance */ \ + default_init, \ + default_cleanup, \ + pixman_composite_src_0888_0565_rev_process_pixblock_head, \ + pixman_composite_src_0888_0565_rev_process_pixblock_tail, \ + pixman_composite_src_0888_0565_rev_process_pixblock_tail_head, \ + 28, /* dst_w_basereg */ \ + 0, /* dst_r_basereg */ \ + 0, /* src_basereg */ \ + 0 /* mask_basereg */ + +/******************************************************************************/ + +.macro pixman_composite_src_pixbuf_8888_process_pixblock_head + vmull.u8 q8, d3, d0 + vmull.u8 q9, d3, d1 + vmull.u8 q10, d3, d2 +.endm + +.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail + vrshr.u16 q11, q8, #8 + vswp d3, d31 + vrshr.u16 q12, q9, #8 + vrshr.u16 q13, q10, #8 + vraddhn.u16 d30, q11, q8 + vraddhn.u16 d29, q12, q9 + vraddhn.u16 d28, q13, q10 +.endm + +.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail_head + vrshr.u16 q11, q8, #8 + vswp d3, d31 + vrshr.u16 q12, q9, #8 + vrshr.u16 q13, q10, #8 + fetch_src_pixblock + vraddhn.u16 d30, q11, q8 + PF add PF_X, PF_X, #8 + PF tst PF_CTL, #0xF + PF addne PF_X, PF_X, #8 + PF subne PF_CTL, PF_CTL, #1 + vraddhn.u16 d29, q12, q9 + vraddhn.u16 d28, q13, q10 + vmull.u8 q8, d3, d0 + vmull.u8 q9, d3, d1 + vmull.u8 q10, d3, d2 + vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! + PF cmp PF_X, ORIG_W + PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] + PF subge PF_X, PF_X, ORIG_W + PF subges PF_CTL, PF_CTL, #0x10 + PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! +.endm + +generate_composite_function \ + pixman_composite_src_pixbuf_8888_asm_neon, 32, 0, 32, \ + FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + 10, /* prefetch distance */ \ + default_init, \ + default_cleanup, \ + pixman_composite_src_pixbuf_8888_process_pixblock_head, \ + pixman_composite_src_pixbuf_8888_process_pixblock_tail, \ + pixman_composite_src_pixbuf_8888_process_pixblock_tail_head, \ + 28, /* dst_w_basereg */ \ + 0, /* dst_r_basereg */ \ + 0, /* src_basereg */ \ + 0 /* mask_basereg */ + +/******************************************************************************/ + +.macro pixman_composite_src_rpixbuf_8888_process_pixblock_head + vmull.u8 q8, d3, d0 + vmull.u8 q9, d3, d1 + vmull.u8 q10, d3, d2 +.endm + +.macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail + vrshr.u16 q11, q8, #8 + vswp d3, d31 + vrshr.u16 q12, q9, #8 + vrshr.u16 q13, q10, #8 + vraddhn.u16 d28, q11, q8 + vraddhn.u16 d29, q12, q9 + vraddhn.u16 d30, q13, q10 +.endm + +.macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head + vrshr.u16 q11, q8, #8 + vswp d3, d31 + vrshr.u16 q12, q9, #8 + vrshr.u16 q13, q10, #8 + fetch_src_pixblock + vraddhn.u16 d28, q11, q8 + PF add PF_X, PF_X, #8 + PF tst PF_CTL, #0xF + PF addne PF_X, PF_X, #8 + PF subne PF_CTL, PF_CTL, #1 + vraddhn.u16 d29, q12, q9 + vraddhn.u16 d30, q13, q10 + vmull.u8 q8, d3, d0 + vmull.u8 q9, d3, d1 + vmull.u8 q10, d3, d2 + vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! + PF cmp PF_X, ORIG_W + PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] + PF subge PF_X, PF_X, ORIG_W + PF subges PF_CTL, PF_CTL, #0x10 + PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! 
+.endm + +generate_composite_function \ + pixman_composite_src_rpixbuf_8888_asm_neon, 32, 0, 32, \ + FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + 10, /* prefetch distance */ \ + default_init, \ + default_cleanup, \ + pixman_composite_src_rpixbuf_8888_process_pixblock_head, \ + pixman_composite_src_rpixbuf_8888_process_pixblock_tail, \ + pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head, \ + 28, /* dst_w_basereg */ \ + 0, /* dst_r_basereg */ \ + 0, /* src_basereg */ \ + 0 /* mask_basereg */ + +/******************************************************************************/ + +.macro pixman_composite_over_0565_8_0565_process_pixblock_head + /* mask is in d15 */ + convert_0565_to_x888 q4, d2, d1, d0 + convert_0565_to_x888 q5, d6, d5, d4 + /* source pixel data is in {d0, d1, d2, XX} */ + /* destination pixel data is in {d4, d5, d6, XX} */ + vmvn.8 d7, d15 + vmull.u8 q6, d15, d2 + vmull.u8 q5, d15, d1 + vmull.u8 q4, d15, d0 + vmull.u8 q8, d7, d4 + vmull.u8 q9, d7, d5 + vmull.u8 q13, d7, d6 + vrshr.u16 q12, q6, #8 + vrshr.u16 q11, q5, #8 + vrshr.u16 q10, q4, #8 + vraddhn.u16 d2, q6, q12 + vraddhn.u16 d1, q5, q11 + vraddhn.u16 d0, q4, q10 +.endm + +.macro pixman_composite_over_0565_8_0565_process_pixblock_tail + vrshr.u16 q14, q8, #8 + vrshr.u16 q15, q9, #8 + vrshr.u16 q12, q13, #8 + vraddhn.u16 d28, q14, q8 + vraddhn.u16 d29, q15, q9 + vraddhn.u16 d30, q12, q13 + vqadd.u8 q0, q0, q14 + vqadd.u8 q1, q1, q15 + /* 32bpp result is in {d0, d1, d2, XX} */ + convert_8888_to_0565 d2, d1, d0, q14, q15, q3 +.endm + +/* TODO: expand macros and do better instructions scheduling */ +.macro pixman_composite_over_0565_8_0565_process_pixblock_tail_head + fetch_mask_pixblock + pixman_composite_over_0565_8_0565_process_pixblock_tail + fetch_src_pixblock + vld1.16 {d10, d11}, [DST_R, :128]! + cache_preload 8, 8 + pixman_composite_over_0565_8_0565_process_pixblock_head + vst1.16 {d28, d29}, [DST_W, :128]! 
+.endm + +generate_composite_function \ + pixman_composite_over_0565_8_0565_asm_neon, 16, 8, 16, \ + FLAG_DST_READWRITE, \ + 8, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + default_init_need_all_regs, \ + default_cleanup_need_all_regs, \ + pixman_composite_over_0565_8_0565_process_pixblock_head, \ + pixman_composite_over_0565_8_0565_process_pixblock_tail, \ + pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \ + 28, /* dst_w_basereg */ \ + 10, /* dst_r_basereg */ \ + 8, /* src_basereg */ \ + 15 /* mask_basereg */ + +/******************************************************************************/ + +.macro pixman_composite_over_0565_n_0565_init + add DUMMY, sp, #(ARGS_STACK_OFFSET + 8) + vpush {d8-d15} + vld1.32 {d15[0]}, [DUMMY] + vdup.8 d15, d15[3] +.endm + +.macro pixman_composite_over_0565_n_0565_cleanup + vpop {d8-d15} +.endm + +generate_composite_function \ + pixman_composite_over_0565_n_0565_asm_neon, 16, 0, 16, \ + FLAG_DST_READWRITE, \ + 8, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + pixman_composite_over_0565_n_0565_init, \ + pixman_composite_over_0565_n_0565_cleanup, \ + pixman_composite_over_0565_8_0565_process_pixblock_head, \ + pixman_composite_over_0565_8_0565_process_pixblock_tail, \ + pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \ + 28, /* dst_w_basereg */ \ + 10, /* dst_r_basereg */ \ + 8, /* src_basereg */ \ + 15 /* mask_basereg */ + +/******************************************************************************/ + +.macro pixman_composite_add_0565_8_0565_process_pixblock_head + /* mask is in d15 */ + convert_0565_to_x888 q4, d2, d1, d0 + convert_0565_to_x888 q5, d6, d5, d4 + /* source pixel data is in {d0, d1, d2, XX} */ + /* destination pixel data is in {d4, d5, d6, XX} */ + vmull.u8 q6, d15, d2 + vmull.u8 q5, d15, d1 + vmull.u8 q4, d15, d0 + vrshr.u16 q12, q6, #8 + vrshr.u16 q11, q5, #8 + vrshr.u16 q10, q4, #8 + vraddhn.u16 d2, q6, q12 + vraddhn.u16 d1, q5, q11 + vraddhn.u16 d0, q4, q10 +.endm + +.macro pixman_composite_add_0565_8_0565_process_pixblock_tail + vqadd.u8 q0, q0, q2 + vqadd.u8 q1, q1, q3 + /* 32bpp result is in {d0, d1, d2, XX} */ + convert_8888_to_0565 d2, d1, d0, q14, q15, q3 +.endm + +/* TODO: expand macros and do better instructions scheduling */ +.macro pixman_composite_add_0565_8_0565_process_pixblock_tail_head + fetch_mask_pixblock + pixman_composite_add_0565_8_0565_process_pixblock_tail + fetch_src_pixblock + vld1.16 {d10, d11}, [DST_R, :128]! + cache_preload 8, 8 + pixman_composite_add_0565_8_0565_process_pixblock_head + vst1.16 {d28, d29}, [DST_W, :128]! 
+.endm + +generate_composite_function \ + pixman_composite_add_0565_8_0565_asm_neon, 16, 8, 16, \ + FLAG_DST_READWRITE, \ + 8, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + default_init_need_all_regs, \ + default_cleanup_need_all_regs, \ + pixman_composite_add_0565_8_0565_process_pixblock_head, \ + pixman_composite_add_0565_8_0565_process_pixblock_tail, \ + pixman_composite_add_0565_8_0565_process_pixblock_tail_head, \ + 28, /* dst_w_basereg */ \ + 10, /* dst_r_basereg */ \ + 8, /* src_basereg */ \ + 15 /* mask_basereg */ + +/******************************************************************************/ + +.macro pixman_composite_out_reverse_8_0565_process_pixblock_head + /* mask is in d15 */ + convert_0565_to_x888 q5, d6, d5, d4 + /* destination pixel data is in {d4, d5, d6, xx} */ + vmvn.8 d24, d15 /* get inverted alpha */ + /* now do alpha blending */ + vmull.u8 q8, d24, d4 + vmull.u8 q9, d24, d5 + vmull.u8 q10, d24, d6 +.endm + +.macro pixman_composite_out_reverse_8_0565_process_pixblock_tail + vrshr.u16 q14, q8, #8 + vrshr.u16 q15, q9, #8 + vrshr.u16 q12, q10, #8 + vraddhn.u16 d0, q14, q8 + vraddhn.u16 d1, q15, q9 + vraddhn.u16 d2, q12, q10 + /* 32bpp result is in {d0, d1, d2, XX} */ + convert_8888_to_0565 d2, d1, d0, q14, q15, q3 +.endm + +/* TODO: expand macros and do better instructions scheduling */ +.macro pixman_composite_out_reverse_8_0565_process_pixblock_tail_head + fetch_src_pixblock + pixman_composite_out_reverse_8_0565_process_pixblock_tail + vld1.16 {d10, d11}, [DST_R, :128]! + cache_preload 8, 8 + pixman_composite_out_reverse_8_0565_process_pixblock_head + vst1.16 {d28, d29}, [DST_W, :128]! +.endm + +generate_composite_function \ + pixman_composite_out_reverse_8_0565_asm_neon, 8, 0, 16, \ + FLAG_DST_READWRITE, \ + 8, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + default_init_need_all_regs, \ + default_cleanup_need_all_regs, \ + pixman_composite_out_reverse_8_0565_process_pixblock_head, \ + pixman_composite_out_reverse_8_0565_process_pixblock_tail, \ + pixman_composite_out_reverse_8_0565_process_pixblock_tail_head, \ + 28, /* dst_w_basereg */ \ + 10, /* dst_r_basereg */ \ + 15, /* src_basereg */ \ + 0 /* mask_basereg */ + +/******************************************************************************/ + +generate_composite_function_nearest_scanline \ + pixman_scaled_nearest_scanline_8888_8888_OVER_asm_neon, 32, 0, 32, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + default_init, \ + default_cleanup, \ + pixman_composite_over_8888_8888_process_pixblock_head, \ + pixman_composite_over_8888_8888_process_pixblock_tail, \ + pixman_composite_over_8888_8888_process_pixblock_tail_head + +generate_composite_function_nearest_scanline \ + pixman_scaled_nearest_scanline_8888_0565_OVER_asm_neon, 32, 0, 16, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + default_init, \ + default_cleanup, \ + pixman_composite_over_8888_0565_process_pixblock_head, \ + pixman_composite_over_8888_0565_process_pixblock_tail, \ + pixman_composite_over_8888_0565_process_pixblock_tail_head, \ + 28, /* dst_w_basereg */ \ + 4, /* dst_r_basereg */ \ + 0, /* src_basereg */ \ + 24 /* mask_basereg */ + +generate_composite_function_nearest_scanline \ + pixman_scaled_nearest_scanline_8888_0565_SRC_asm_neon, 32, 0, 16, \ + FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in 
a single block */ \ + default_init, \ + default_cleanup, \ + pixman_composite_src_8888_0565_process_pixblock_head, \ + pixman_composite_src_8888_0565_process_pixblock_tail, \ + pixman_composite_src_8888_0565_process_pixblock_tail_head + +generate_composite_function_nearest_scanline \ + pixman_scaled_nearest_scanline_0565_8888_SRC_asm_neon, 16, 0, 32, \ + FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + default_init, \ + default_cleanup, \ + pixman_composite_src_0565_8888_process_pixblock_head, \ + pixman_composite_src_0565_8888_process_pixblock_tail, \ + pixman_composite_src_0565_8888_process_pixblock_tail_head + +generate_composite_function_nearest_scanline \ + pixman_scaled_nearest_scanline_8888_8_0565_OVER_asm_neon, 32, 8, 16, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + default_init_need_all_regs, \ + default_cleanup_need_all_regs, \ + pixman_composite_over_8888_8_0565_process_pixblock_head, \ + pixman_composite_over_8888_8_0565_process_pixblock_tail, \ + pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \ + 28, /* dst_w_basereg */ \ + 4, /* dst_r_basereg */ \ + 8, /* src_basereg */ \ + 24 /* mask_basereg */ + +generate_composite_function_nearest_scanline \ + pixman_scaled_nearest_scanline_0565_8_0565_OVER_asm_neon, 16, 8, 16, \ + FLAG_DST_READWRITE, \ + 8, /* number of pixels, processed in a single block */ \ + default_init_need_all_regs, \ + default_cleanup_need_all_regs, \ + pixman_composite_over_0565_8_0565_process_pixblock_head, \ + pixman_composite_over_0565_8_0565_process_pixblock_tail, \ + pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \ + 28, /* dst_w_basereg */ \ + 10, /* dst_r_basereg */ \ + 8, /* src_basereg */ \ + 15 /* mask_basereg */ diff --git a/pixman/pixman/pixman-arm-neon.c b/pixman/pixman/pixman-arm-neon.c index 7d6c83775..3e0c0d1c2 100644 --- a/pixman/pixman/pixman-arm-neon.c +++ b/pixman/pixman/pixman-arm-neon.c @@ -122,6 +122,11 @@ PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (neon, 8888_0565, SRC, PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (neon, 0565_8888, SRC, uint16_t, uint32_t) +PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_A8_DST (SKIP_ZERO_SRC, neon, 8888_8_0565, + OVER, uint32_t, uint16_t) +PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_A8_DST (SKIP_ZERO_SRC, neon, 0565_8_0565, + OVER, uint16_t, uint16_t) + void pixman_composite_src_n_8_asm_neon (int32_t w, int32_t h, @@ -332,6 +337,12 @@ static const pixman_fast_path_t arm_neon_fast_paths[] = SIMPLE_NEAREST_FAST_PATH_PAD (SRC, r5g6b5, a8r8g8b8, neon_0565_8888), SIMPLE_NEAREST_FAST_PATH_PAD (SRC, b5g6r5, a8b8g8r8, neon_0565_8888), + PIXMAN_ARM_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, a8r8g8b8, r5g6b5, neon_8888_8_0565), + PIXMAN_ARM_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, a8b8g8r8, b5g6r5, neon_8888_8_0565), + + PIXMAN_ARM_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, r5g6b5, r5g6b5, neon_0565_8_0565), + PIXMAN_ARM_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, b5g6r5, b5g6r5, neon_0565_8_0565), + { PIXMAN_OP_NONE }, }; diff --git a/pixman/pixman/pixman-fast-path.c b/pixman/pixman/pixman-fast-path.c index 4cb8321aa..92f030871 100644 --- a/pixman/pixman/pixman-fast-path.c +++ b/pixman/pixman/pixman-fast-path.c @@ -1,2227 +1,2228 @@ -/* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */ -/* - * Copyright © 2000 SuSE, Inc. - * Copyright © 2007 Red Hat, Inc. 
- * - * Permission to use, copy, modify, distribute, and sell this software and its - * documentation for any purpose is hereby granted without fee, provided that - * the above copyright notice appear in all copies and that both that - * copyright notice and this permission notice appear in supporting - * documentation, and that the name of SuSE not be used in advertising or - * publicity pertaining to distribution of the software without specific, - * written prior permission. SuSE makes no representations about the - * suitability of this software for any purpose. It is provided "as is" - * without express or implied warranty. - * - * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE - * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - * - * Author: Keith Packard, SuSE, Inc. - */ - -#ifdef HAVE_CONFIG_H -#include -#endif -#include -#include -#include "pixman-private.h" -#include "pixman-combine32.h" -#include "pixman-fast-path.h" - -static force_inline uint32_t -fetch_24 (uint8_t *a) -{ - if (((unsigned long)a) & 1) - { -#ifdef WORDS_BIGENDIAN - return (*a << 16) | (*(uint16_t *)(a + 1)); -#else - return *a | (*(uint16_t *)(a + 1) << 8); -#endif - } - else - { -#ifdef WORDS_BIGENDIAN - return (*(uint16_t *)a << 8) | *(a + 2); -#else - return *(uint16_t *)a | (*(a + 2) << 16); -#endif - } -} - -static force_inline void -store_24 (uint8_t *a, - uint32_t v) -{ - if (((unsigned long)a) & 1) - { -#ifdef WORDS_BIGENDIAN - *a = (uint8_t) (v >> 16); - *(uint16_t *)(a + 1) = (uint16_t) (v); -#else - *a = (uint8_t) (v); - *(uint16_t *)(a + 1) = (uint16_t) (v >> 8); -#endif - } - else - { -#ifdef WORDS_BIGENDIAN - *(uint16_t *)a = (uint16_t)(v >> 8); - *(a + 2) = (uint8_t)v; -#else - *(uint16_t *)a = (uint16_t)v; - *(a + 2) = (uint8_t)(v >> 16); -#endif - } -} - -static force_inline uint32_t -over (uint32_t src, - uint32_t dest) -{ - uint32_t a = ~src >> 24; - - UN8x4_MUL_UN8_ADD_UN8x4 (dest, a, src); - - return dest; -} - -static uint32_t -in (uint32_t x, - uint8_t y) -{ - uint16_t a = y; - - UN8x4_MUL_UN8 (x, a); - - return x; -} - -/* - * Naming convention: - * - * op_src_mask_dest - */ -static void -fast_composite_over_x888_8_8888 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) -{ - uint32_t *src, *src_line; - uint32_t *dst, *dst_line; - uint8_t *mask, *mask_line; - int src_stride, mask_stride, dst_stride; - uint8_t m; - uint32_t s, d; - int32_t w; - - PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); - PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); - PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); - - while (height--) - { - src = src_line; - src_line += src_stride; - dst = dst_line; - dst_line += dst_stride; - mask = mask_line; - mask_line += mask_stride; - - w = width; - while (w--) - { - m = *mask++; - if (m) - { - s = *src | 0xff000000; - - if (m == 0xff) - { - *dst = s; - } - else - { - d = in (s, m); - *dst 
= over (d, *dst); - } - } - src++; - dst++; - } - } -} - -static void -fast_composite_in_n_8_8 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dest_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) -{ - uint32_t src, srca; - uint8_t *dst_line, *dst; - uint8_t *mask_line, *mask, m; - int dst_stride, mask_stride; - int32_t w; - uint16_t t; - - src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); - - srca = src >> 24; - - PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); - PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); - - if (srca == 0xff) - { - while (height--) - { - dst = dst_line; - dst_line += dst_stride; - mask = mask_line; - mask_line += mask_stride; - w = width; - - while (w--) - { - m = *mask++; - - if (m == 0) - *dst = 0; - else if (m != 0xff) - *dst = MUL_UN8 (m, *dst, t); - - dst++; - } - } - } - else - { - while (height--) - { - dst = dst_line; - dst_line += dst_stride; - mask = mask_line; - mask_line += mask_stride; - w = width; - - while (w--) - { - m = *mask++; - m = MUL_UN8 (m, srca, t); - - if (m == 0) - *dst = 0; - else if (m != 0xff) - *dst = MUL_UN8 (m, *dst, t); - - dst++; - } - } - } -} - -static void -fast_composite_in_8_8 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dest_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) -{ - uint8_t *dst_line, *dst; - uint8_t *src_line, *src; - int dst_stride, src_stride; - int32_t w; - uint8_t s; - uint16_t t; - - PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1); - PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); - - while (height--) - { - dst = dst_line; - dst_line += dst_stride; - src = src_line; - src_line += src_stride; - w = width; - - while (w--) - { - s = *src++; - - if (s == 0) - *dst = 0; - else if (s != 0xff) - *dst = MUL_UN8 (s, *dst, t); - - dst++; - } - } -} - -static void -fast_composite_over_n_8_8888 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) -{ - uint32_t src, srca; - uint32_t *dst_line, *dst, d; - uint8_t *mask_line, *mask, m; - int dst_stride, mask_stride; - int32_t w; - - src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format); - - srca = src >> 24; - if (src == 0) - return; - - PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); - PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); - - while (height--) - { - dst = dst_line; - dst_line += dst_stride; - mask = mask_line; - mask_line += mask_stride; - w = width; - - while (w--) - { - m = *mask++; - if (m == 0xff) - { - if (srca == 0xff) - *dst = src; - else - *dst = over (src, *dst); - } - else if (m) - { - d = in (src, m); - *dst = over (d, *dst); - } - dst++; - } - } -} - -static void -fast_composite_add_n_8888_8888_ca (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - 
pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) -{ - uint32_t src, s; - uint32_t *dst_line, *dst, d; - uint32_t *mask_line, *mask, ma; - int dst_stride, mask_stride; - int32_t w; - - src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format); - - if (src == 0) - return; - - PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); - PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); - - while (height--) - { - dst = dst_line; - dst_line += dst_stride; - mask = mask_line; - mask_line += mask_stride; - w = width; - - while (w--) - { - ma = *mask++; - - if (ma) - { - d = *dst; - s = src; - - UN8x4_MUL_UN8x4_ADD_UN8x4 (s, ma, d); - - *dst = s; - } - - dst++; - } - } -} - -static void -fast_composite_over_n_8888_8888_ca (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) -{ - uint32_t src, srca, s; - uint32_t *dst_line, *dst, d; - uint32_t *mask_line, *mask, ma; - int dst_stride, mask_stride; - int32_t w; - - src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format); - - srca = src >> 24; - if (src == 0) - return; - - PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); - PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); - - while (height--) - { - dst = dst_line; - dst_line += dst_stride; - mask = mask_line; - mask_line += mask_stride; - w = width; - - while (w--) - { - ma = *mask++; - if (ma == 0xffffffff) - { - if (srca == 0xff) - *dst = src; - else - *dst = over (src, *dst); - } - else if (ma) - { - d = *dst; - s = src; - - UN8x4_MUL_UN8x4 (s, ma); - UN8x4_MUL_UN8 (ma, srca); - ma = ~ma; - UN8x4_MUL_UN8x4_ADD_UN8x4 (d, ma, s); - - *dst = d; - } - - dst++; - } - } -} - -static void -fast_composite_over_n_8_0888 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) -{ - uint32_t src, srca; - uint8_t *dst_line, *dst; - uint32_t d; - uint8_t *mask_line, *mask, m; - int dst_stride, mask_stride; - int32_t w; - - src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format); - - srca = src >> 24; - if (src == 0) - return; - - PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 3); - PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); - - while (height--) - { - dst = dst_line; - dst_line += dst_stride; - mask = mask_line; - mask_line += mask_stride; - w = width; - - while (w--) - { - m = *mask++; - if (m == 0xff) - { - if (srca == 0xff) - { - d = src; - } - else - { - d = fetch_24 (dst); - d = over (src, d); - } - store_24 (dst, d); - } - else if (m) - { - d = over (in (src, m), fetch_24 (dst)); - store_24 (dst, d); - } - dst += 3; - } - } -} - -static void -fast_composite_over_n_8_0565 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t 
dest_x, - int32_t dest_y, - int32_t width, - int32_t height) -{ - uint32_t src, srca; - uint16_t *dst_line, *dst; - uint32_t d; - uint8_t *mask_line, *mask, m; - int dst_stride, mask_stride; - int32_t w; - - src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format); - - srca = src >> 24; - if (src == 0) - return; - - PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); - PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); - - while (height--) - { - dst = dst_line; - dst_line += dst_stride; - mask = mask_line; - mask_line += mask_stride; - w = width; - - while (w--) - { - m = *mask++; - if (m == 0xff) - { - if (srca == 0xff) - { - d = src; - } - else - { - d = *dst; - d = over (src, CONVERT_0565_TO_0888 (d)); - } - *dst = CONVERT_8888_TO_0565 (d); - } - else if (m) - { - d = *dst; - d = over (in (src, m), CONVERT_0565_TO_0888 (d)); - *dst = CONVERT_8888_TO_0565 (d); - } - dst++; - } - } -} - -static void -fast_composite_over_n_8888_0565_ca (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) -{ - uint32_t src, srca, s; - uint16_t src16; - uint16_t *dst_line, *dst; - uint32_t d; - uint32_t *mask_line, *mask, ma; - int dst_stride, mask_stride; - int32_t w; - - src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format); - - srca = src >> 24; - if (src == 0) - return; - - src16 = CONVERT_8888_TO_0565 (src); - - PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); - PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); - - while (height--) - { - dst = dst_line; - dst_line += dst_stride; - mask = mask_line; - mask_line += mask_stride; - w = width; - - while (w--) - { - ma = *mask++; - if (ma == 0xffffffff) - { - if (srca == 0xff) - { - *dst = src16; - } - else - { - d = *dst; - d = over (src, CONVERT_0565_TO_0888 (d)); - *dst = CONVERT_8888_TO_0565 (d); - } - } - else if (ma) - { - d = *dst; - d = CONVERT_0565_TO_0888 (d); - - s = src; - - UN8x4_MUL_UN8x4 (s, ma); - UN8x4_MUL_UN8 (ma, srca); - ma = ~ma; - UN8x4_MUL_UN8x4_ADD_UN8x4 (d, ma, s); - - *dst = CONVERT_8888_TO_0565 (d); - } - dst++; - } - } -} - -static void -fast_composite_over_8888_8888 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) -{ - uint32_t *dst_line, *dst; - uint32_t *src_line, *src, s; - int dst_stride, src_stride; - uint8_t a; - int32_t w; - - PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); - PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); - - while (height--) - { - dst = dst_line; - dst_line += dst_stride; - src = src_line; - src_line += src_stride; - w = width; - - while (w--) - { - s = *src++; - a = s >> 24; - if (a == 0xff) - *dst = s; - else if (s) - *dst = over (s, *dst); - dst++; - } - } -} - -static void -fast_composite_src_x888_8888 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t 
mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) -{ - uint32_t *dst_line, *dst; - uint32_t *src_line, *src; - int dst_stride, src_stride; - int32_t w; - - PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); - PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); - - while (height--) - { - dst = dst_line; - dst_line += dst_stride; - src = src_line; - src_line += src_stride; - w = width; - - while (w--) - *dst++ = (*src++) | 0xff000000; - } -} - -#if 0 -static void -fast_composite_over_8888_0888 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) -{ - uint8_t *dst_line, *dst; - uint32_t d; - uint32_t *src_line, *src, s; - uint8_t a; - int dst_stride, src_stride; - int32_t w; - - PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 3); - PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); - - while (height--) - { - dst = dst_line; - dst_line += dst_stride; - src = src_line; - src_line += src_stride; - w = width; - - while (w--) - { - s = *src++; - a = s >> 24; - if (a) - { - if (a == 0xff) - d = s; - else - d = over (s, fetch_24 (dst)); - - store_24 (dst, d); - } - dst += 3; - } - } -} -#endif - -static void -fast_composite_over_8888_0565 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) -{ - uint16_t *dst_line, *dst; - uint32_t d; - uint32_t *src_line, *src, s; - uint8_t a; - int dst_stride, src_stride; - int32_t w; - - PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); - PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); - - while (height--) - { - dst = dst_line; - dst_line += dst_stride; - src = src_line; - src_line += src_stride; - w = width; - - while (w--) - { - s = *src++; - a = s >> 24; - if (s) - { - if (a == 0xff) - { - d = s; - } - else - { - d = *dst; - d = over (s, CONVERT_0565_TO_0888 (d)); - } - *dst = CONVERT_8888_TO_0565 (d); - } - dst++; - } - } -} - -static void -fast_composite_src_x888_0565 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) -{ - uint16_t *dst_line, *dst; - uint32_t *src_line, *src, s; - int dst_stride, src_stride; - int32_t w; - - PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); - PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); - - while (height--) - { - dst = dst_line; - dst_line += dst_stride; - src = src_line; - src_line += src_stride; - w = width; - - while (w--) - { - s = *src++; - *dst = CONVERT_8888_TO_0565 (s); - dst++; - } - } -} - -static void -fast_composite_add_8_8 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - 
int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) -{ - uint8_t *dst_line, *dst; - uint8_t *src_line, *src; - int dst_stride, src_stride; - int32_t w; - uint8_t s, d; - uint16_t t; - - PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1); - PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); - - while (height--) - { - dst = dst_line; - dst_line += dst_stride; - src = src_line; - src_line += src_stride; - w = width; - - while (w--) - { - s = *src++; - if (s) - { - if (s != 0xff) - { - d = *dst; - t = d + s; - s = t | (0 - (t >> 8)); - } - *dst = s; - } - dst++; - } - } -} - -static void -fast_composite_add_8888_8888 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) -{ - uint32_t *dst_line, *dst; - uint32_t *src_line, *src; - int dst_stride, src_stride; - int32_t w; - uint32_t s, d; - - PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); - PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); - - while (height--) - { - dst = dst_line; - dst_line += dst_stride; - src = src_line; - src_line += src_stride; - w = width; - - while (w--) - { - s = *src++; - if (s) - { - if (s != 0xffffffff) - { - d = *dst; - if (d) - UN8x4_ADD_UN8x4 (s, d); - } - *dst = s; - } - dst++; - } - } -} - -static void -fast_composite_add_n_8_8 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) -{ - uint8_t *dst_line, *dst; - uint8_t *mask_line, *mask; - int dst_stride, mask_stride; - int32_t w; - uint32_t src; - uint8_t sa; - - PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); - PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); - src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format); - sa = (src >> 24); - - while (height--) - { - dst = dst_line; - dst_line += dst_stride; - mask = mask_line; - mask_line += mask_stride; - w = width; - - while (w--) - { - uint16_t tmp; - uint16_t a; - uint32_t m, d; - uint32_t r; - - a = *mask++; - d = *dst; - - m = MUL_UN8 (sa, a, tmp); - r = ADD_UN8 (m, d, tmp); - - *dst++ = r; - } - } -} - -#ifdef WORDS_BIGENDIAN -#define CREATE_BITMASK(n) (0x80000000 >> (n)) -#define UPDATE_BITMASK(n) ((n) >> 1) -#else -#define CREATE_BITMASK(n) (1 << (n)) -#define UPDATE_BITMASK(n) ((n) << 1) -#endif - -#define TEST_BIT(p, n) \ - (*((p) + ((n) >> 5)) & CREATE_BITMASK ((n) & 31)) -#define SET_BIT(p, n) \ - do { *((p) + ((n) >> 5)) |= CREATE_BITMASK ((n) & 31); } while (0); - -static void -fast_composite_add_1000_1000 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) -{ - uint32_t *dst_line, *dst; - uint32_t *src_line, *src; - int dst_stride, src_stride; - int32_t w; - - PIXMAN_IMAGE_GET_LINE (src_image, 0, src_y, uint32_t, - src_stride, src_line, 1); - PIXMAN_IMAGE_GET_LINE (dst_image, 0, 
dest_y, uint32_t, - dst_stride, dst_line, 1); - - while (height--) - { - dst = dst_line; - dst_line += dst_stride; - src = src_line; - src_line += src_stride; - w = width; - - while (w--) - { - /* - * TODO: improve performance by processing uint32_t data instead - * of individual bits - */ - if (TEST_BIT (src, src_x + w)) - SET_BIT (dst, dest_x + w); - } - } -} - -static void -fast_composite_over_n_1_8888 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) -{ - uint32_t src, srca; - uint32_t *dst, *dst_line; - uint32_t *mask, *mask_line; - int mask_stride, dst_stride; - uint32_t bitcache, bitmask; - int32_t w; - - if (width <= 0) - return; - - src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format); - srca = src >> 24; - if (src == 0) - return; - - PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, - dst_stride, dst_line, 1); - PIXMAN_IMAGE_GET_LINE (mask_image, 0, mask_y, uint32_t, - mask_stride, mask_line, 1); - mask_line += mask_x >> 5; - - if (srca == 0xff) - { - while (height--) - { - dst = dst_line; - dst_line += dst_stride; - mask = mask_line; - mask_line += mask_stride; - w = width; - - bitcache = *mask++; - bitmask = CREATE_BITMASK (mask_x & 31); - - while (w--) - { - if (bitmask == 0) - { - bitcache = *mask++; - bitmask = CREATE_BITMASK (0); - } - if (bitcache & bitmask) - *dst = src; - bitmask = UPDATE_BITMASK (bitmask); - dst++; - } - } - } - else - { - while (height--) - { - dst = dst_line; - dst_line += dst_stride; - mask = mask_line; - mask_line += mask_stride; - w = width; - - bitcache = *mask++; - bitmask = CREATE_BITMASK (mask_x & 31); - - while (w--) - { - if (bitmask == 0) - { - bitcache = *mask++; - bitmask = CREATE_BITMASK (0); - } - if (bitcache & bitmask) - *dst = over (src, *dst); - bitmask = UPDATE_BITMASK (bitmask); - dst++; - } - } - } -} - -static void -fast_composite_over_n_1_0565 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) -{ - uint32_t src, srca; - uint16_t *dst, *dst_line; - uint32_t *mask, *mask_line; - int mask_stride, dst_stride; - uint32_t bitcache, bitmask; - int32_t w; - uint32_t d; - uint16_t src565; - - if (width <= 0) - return; - - src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format); - srca = src >> 24; - if (src == 0) - return; - - PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, - dst_stride, dst_line, 1); - PIXMAN_IMAGE_GET_LINE (mask_image, 0, mask_y, uint32_t, - mask_stride, mask_line, 1); - mask_line += mask_x >> 5; - - if (srca == 0xff) - { - src565 = CONVERT_8888_TO_0565 (src); - while (height--) - { - dst = dst_line; - dst_line += dst_stride; - mask = mask_line; - mask_line += mask_stride; - w = width; - - bitcache = *mask++; - bitmask = CREATE_BITMASK (mask_x & 31); - - while (w--) - { - if (bitmask == 0) - { - bitcache = *mask++; - bitmask = CREATE_BITMASK (0); - } - if (bitcache & bitmask) - *dst = src565; - bitmask = UPDATE_BITMASK (bitmask); - dst++; - } - } - } - else - { - while (height--) - { - dst = dst_line; - dst_line += dst_stride; - mask = mask_line; - mask_line += mask_stride; - w = width; - - bitcache = *mask++; - 
bitmask = CREATE_BITMASK (mask_x & 31); - - while (w--) - { - if (bitmask == 0) - { - bitcache = *mask++; - bitmask = CREATE_BITMASK (0); - } - if (bitcache & bitmask) - { - d = over (src, CONVERT_0565_TO_0888 (*dst)); - *dst = CONVERT_8888_TO_0565 (d); - } - bitmask = UPDATE_BITMASK (bitmask); - dst++; - } - } - } -} - -/* - * Simple bitblt - */ - -static void -fast_composite_solid_fill (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) -{ - uint32_t src; - - src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format); - - if (dst_image->bits.format == PIXMAN_a1) - { - src = src >> 31; - } - else if (dst_image->bits.format == PIXMAN_a8) - { - src = src >> 24; - } - else if (dst_image->bits.format == PIXMAN_r5g6b5 || - dst_image->bits.format == PIXMAN_b5g6r5) - { - src = CONVERT_8888_TO_0565 (src); - } - - pixman_fill (dst_image->bits.bits, dst_image->bits.rowstride, - PIXMAN_FORMAT_BPP (dst_image->bits.format), - dest_x, dest_y, - width, height, - src); -} - -static void -fast_composite_src_memcpy (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) -{ - int bpp = PIXMAN_FORMAT_BPP (dst_image->bits.format) / 8; - uint32_t n_bytes = width * bpp; - int dst_stride, src_stride; - uint8_t *dst; - uint8_t *src; - - src_stride = src_image->bits.rowstride * 4; - dst_stride = dst_image->bits.rowstride * 4; - - src = (uint8_t *)src_image->bits.bits + src_y * src_stride + src_x * bpp; - dst = (uint8_t *)dst_image->bits.bits + dest_y * dst_stride + dest_x * bpp; - - while (height--) - { - memcpy (dst, src, n_bytes); - - dst += dst_stride; - src += src_stride; - } -} - -FAST_NEAREST (8888_8888_cover, 8888, 8888, uint32_t, uint32_t, SRC, COVER) -FAST_NEAREST (8888_8888_none, 8888, 8888, uint32_t, uint32_t, SRC, NONE) -FAST_NEAREST (8888_8888_pad, 8888, 8888, uint32_t, uint32_t, SRC, PAD) -FAST_NEAREST (8888_8888_normal, 8888, 8888, uint32_t, uint32_t, SRC, NORMAL) -FAST_NEAREST (8888_8888_cover, 8888, 8888, uint32_t, uint32_t, OVER, COVER) -FAST_NEAREST (8888_8888_none, 8888, 8888, uint32_t, uint32_t, OVER, NONE) -FAST_NEAREST (8888_8888_pad, 8888, 8888, uint32_t, uint32_t, OVER, PAD) -FAST_NEAREST (8888_8888_normal, 8888, 8888, uint32_t, uint32_t, OVER, NORMAL) -FAST_NEAREST (8888_565_cover, 8888, 0565, uint32_t, uint16_t, SRC, COVER) -FAST_NEAREST (8888_565_none, 8888, 0565, uint32_t, uint16_t, SRC, NONE) -FAST_NEAREST (8888_565_pad, 8888, 0565, uint32_t, uint16_t, SRC, PAD) -FAST_NEAREST (8888_565_normal, 8888, 0565, uint32_t, uint16_t, SRC, NORMAL) -FAST_NEAREST (565_565_normal, 0565, 0565, uint16_t, uint16_t, SRC, NORMAL) -FAST_NEAREST (8888_565_cover, 8888, 0565, uint32_t, uint16_t, OVER, COVER) -FAST_NEAREST (8888_565_none, 8888, 0565, uint32_t, uint16_t, OVER, NONE) -FAST_NEAREST (8888_565_pad, 8888, 0565, uint32_t, uint16_t, OVER, PAD) -FAST_NEAREST (8888_565_normal, 8888, 0565, uint32_t, uint16_t, OVER, NORMAL) - -/* Use more unrolling for src_0565_0565 because it is typically CPU bound */ -static force_inline void -scaled_nearest_scanline_565_565_SRC (uint16_t * dst, - uint16_t * src, - int32_t w, - pixman_fixed_t vx, - 
pixman_fixed_t unit_x, - pixman_fixed_t max_vx) -{ - uint16_t tmp1, tmp2, tmp3, tmp4; - while ((w -= 4) >= 0) - { - tmp1 = src[pixman_fixed_to_int (vx)]; - vx += unit_x; - tmp2 = src[pixman_fixed_to_int (vx)]; - vx += unit_x; - tmp3 = src[pixman_fixed_to_int (vx)]; - vx += unit_x; - tmp4 = src[pixman_fixed_to_int (vx)]; - vx += unit_x; - *dst++ = tmp1; - *dst++ = tmp2; - *dst++ = tmp3; - *dst++ = tmp4; - } - if (w & 2) - { - tmp1 = src[pixman_fixed_to_int (vx)]; - vx += unit_x; - tmp2 = src[pixman_fixed_to_int (vx)]; - vx += unit_x; - *dst++ = tmp1; - *dst++ = tmp2; - } - if (w & 1) - *dst++ = src[pixman_fixed_to_int (vx)]; -} - -FAST_NEAREST_MAINLOOP (565_565_cover_SRC, - scaled_nearest_scanline_565_565_SRC, - uint16_t, uint16_t, COVER) -FAST_NEAREST_MAINLOOP (565_565_none_SRC, - scaled_nearest_scanline_565_565_SRC, - uint16_t, uint16_t, NONE) -FAST_NEAREST_MAINLOOP (565_565_pad_SRC, - scaled_nearest_scanline_565_565_SRC, - uint16_t, uint16_t, PAD) - -static force_inline uint32_t -fetch_nearest (pixman_repeat_t src_repeat, - pixman_format_code_t format, - uint32_t *src, int x, int src_width) -{ - if (repeat (src_repeat, &x, src_width)) - { - if (format == PIXMAN_x8r8g8b8) - return *(src + x) | 0xff000000; - else - return *(src + x); - } - else - { - return 0; - } -} - -static force_inline void -combine_over (uint32_t s, uint32_t *dst) -{ - if (s) - { - uint8_t ia = 0xff - (s >> 24); - - if (ia) - UN8x4_MUL_UN8_ADD_UN8x4 (*dst, ia, s); - else - *dst = s; - } -} - -static force_inline void -combine_src (uint32_t s, uint32_t *dst) -{ - *dst = s; -} - -static void -fast_composite_scaled_nearest (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) -{ - uint32_t *dst_line; - uint32_t *src_line; - int dst_stride, src_stride; - int src_width, src_height; - pixman_repeat_t src_repeat; - pixman_fixed_t unit_x, unit_y; - pixman_format_code_t src_format; - pixman_vector_t v; - pixman_fixed_t vy; - - PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); - /* pass in 0 instead of src_x and src_y because src_x and src_y need to be - * transformed from destination space to source space - */ - PIXMAN_IMAGE_GET_LINE (src_image, 0, 0, uint32_t, src_stride, src_line, 1); - - /* reference point is the center of the pixel */ - v.vector[0] = pixman_int_to_fixed (src_x) + pixman_fixed_1 / 2; - v.vector[1] = pixman_int_to_fixed (src_y) + pixman_fixed_1 / 2; - v.vector[2] = pixman_fixed_1; - - if (!pixman_transform_point_3d (src_image->common.transform, &v)) - return; - - unit_x = src_image->common.transform->matrix[0][0]; - unit_y = src_image->common.transform->matrix[1][1]; - - /* Round down to closest integer, ensuring that 0.5 rounds to 0, not 1 */ - v.vector[0] -= pixman_fixed_e; - v.vector[1] -= pixman_fixed_e; - - src_height = src_image->bits.height; - src_width = src_image->bits.width; - src_repeat = src_image->common.repeat; - src_format = src_image->bits.format; - - vy = v.vector[1]; - while (height--) - { - pixman_fixed_t vx = v.vector[0]; - int y = pixman_fixed_to_int (vy); - uint32_t *dst = dst_line; - - dst_line += dst_stride; - - /* adjust the y location by a unit vector in the y direction - * this is equivalent to transforming y+1 of the destination point to source space */ - vy += unit_y; - - if (!repeat (src_repeat, &y, src_height)) - { - if 
(op == PIXMAN_OP_SRC) - memset (dst, 0, sizeof (*dst) * width); - } - else - { - int w = width; - - uint32_t *src = src_line + y * src_stride; - - while (w >= 2) - { - uint32_t s1, s2; - int x1, x2; - - x1 = pixman_fixed_to_int (vx); - vx += unit_x; - - x2 = pixman_fixed_to_int (vx); - vx += unit_x; - - w -= 2; - - s1 = fetch_nearest (src_repeat, src_format, src, x1, src_width); - s2 = fetch_nearest (src_repeat, src_format, src, x2, src_width); - - if (op == PIXMAN_OP_OVER) - { - combine_over (s1, dst++); - combine_over (s2, dst++); - } - else - { - combine_src (s1, dst++); - combine_src (s2, dst++); - } - } - - while (w--) - { - uint32_t s; - int x; - - x = pixman_fixed_to_int (vx); - vx += unit_x; - - s = fetch_nearest (src_repeat, src_format, src, x, src_width); - - if (op == PIXMAN_OP_OVER) - combine_over (s, dst++); - else - combine_src (s, dst++); - } - } - } -} - -#define CACHE_LINE_SIZE 64 - -#define FAST_SIMPLE_ROTATE(suffix, pix_type) \ - \ -static void \ -blt_rotated_90_trivial_##suffix (pix_type *dst, \ - int dst_stride, \ - const pix_type *src, \ - int src_stride, \ - int w, \ - int h) \ -{ \ - int x, y; \ - for (y = 0; y < h; y++) \ - { \ - const pix_type *s = src + (h - y - 1); \ - pix_type *d = dst + dst_stride * y; \ - for (x = 0; x < w; x++) \ - { \ - *d++ = *s; \ - s += src_stride; \ - } \ - } \ -} \ - \ -static void \ -blt_rotated_270_trivial_##suffix (pix_type *dst, \ - int dst_stride, \ - const pix_type *src, \ - int src_stride, \ - int w, \ - int h) \ -{ \ - int x, y; \ - for (y = 0; y < h; y++) \ - { \ - const pix_type *s = src + src_stride * (w - 1) + y; \ - pix_type *d = dst + dst_stride * y; \ - for (x = 0; x < w; x++) \ - { \ - *d++ = *s; \ - s -= src_stride; \ - } \ - } \ -} \ - \ -static void \ -blt_rotated_90_##suffix (pix_type *dst, \ - int dst_stride, \ - const pix_type *src, \ - int src_stride, \ - int W, \ - int H) \ -{ \ - int x; \ - int leading_pixels = 0, trailing_pixels = 0; \ - const int TILE_SIZE = CACHE_LINE_SIZE / sizeof(pix_type); \ - \ - /* \ - * split processing into handling destination as TILE_SIZExH cache line \ - * aligned vertical stripes (optimistically assuming that destination \ - * stride is a multiple of cache line, if not - it will be just a bit \ - * slower) \ - */ \ - \ - if ((uintptr_t)dst & (CACHE_LINE_SIZE - 1)) \ - { \ - leading_pixels = TILE_SIZE - (((uintptr_t)dst & \ - (CACHE_LINE_SIZE - 1)) / sizeof(pix_type)); \ - if (leading_pixels > W) \ - leading_pixels = W; \ - \ - /* unaligned leading part NxH (where N < TILE_SIZE) */ \ - blt_rotated_90_trivial_##suffix ( \ - dst, \ - dst_stride, \ - src, \ - src_stride, \ - leading_pixels, \ - H); \ - \ - dst += leading_pixels; \ - src += leading_pixels * src_stride; \ - W -= leading_pixels; \ - } \ - \ - if ((uintptr_t)(dst + W) & (CACHE_LINE_SIZE - 1)) \ - { \ - trailing_pixels = (((uintptr_t)(dst + W) & \ - (CACHE_LINE_SIZE - 1)) / sizeof(pix_type)); \ - if (trailing_pixels > W) \ - trailing_pixels = W; \ - W -= trailing_pixels; \ - } \ - \ - for (x = 0; x < W; x += TILE_SIZE) \ - { \ - /* aligned middle part TILE_SIZExH */ \ - blt_rotated_90_trivial_##suffix ( \ - dst + x, \ - dst_stride, \ - src + src_stride * x, \ - src_stride, \ - TILE_SIZE, \ - H); \ - } \ - \ - if (trailing_pixels) \ - { \ - /* unaligned trailing part NxH (where N < TILE_SIZE) */ \ - blt_rotated_90_trivial_##suffix ( \ - dst + W, \ - dst_stride, \ - src + W * src_stride, \ - src_stride, \ - trailing_pixels, \ - H); \ - } \ -} \ - \ -static void \ -blt_rotated_270_##suffix (pix_type *dst, \ - int dst_stride, 
\ - const pix_type *src, \ - int src_stride, \ - int W, \ - int H) \ -{ \ - int x; \ - int leading_pixels = 0, trailing_pixels = 0; \ - const int TILE_SIZE = CACHE_LINE_SIZE / sizeof(pix_type); \ - \ - /* \ - * split processing into handling destination as TILE_SIZExH cache line \ - * aligned vertical stripes (optimistically assuming that destination \ - * stride is a multiple of cache line, if not - it will be just a bit \ - * slower) \ - */ \ - \ - if ((uintptr_t)dst & (CACHE_LINE_SIZE - 1)) \ - { \ - leading_pixels = TILE_SIZE - (((uintptr_t)dst & \ - (CACHE_LINE_SIZE - 1)) / sizeof(pix_type)); \ - if (leading_pixels > W) \ - leading_pixels = W; \ - \ - /* unaligned leading part NxH (where N < TILE_SIZE) */ \ - blt_rotated_270_trivial_##suffix ( \ - dst, \ - dst_stride, \ - src + src_stride * (W - leading_pixels), \ - src_stride, \ - leading_pixels, \ - H); \ - \ - dst += leading_pixels; \ - W -= leading_pixels; \ - } \ - \ - if ((uintptr_t)(dst + W) & (CACHE_LINE_SIZE - 1)) \ - { \ - trailing_pixels = (((uintptr_t)(dst + W) & \ - (CACHE_LINE_SIZE - 1)) / sizeof(pix_type)); \ - if (trailing_pixels > W) \ - trailing_pixels = W; \ - W -= trailing_pixels; \ - src += trailing_pixels * src_stride; \ - } \ - \ - for (x = 0; x < W; x += TILE_SIZE) \ - { \ - /* aligned middle part TILE_SIZExH */ \ - blt_rotated_270_trivial_##suffix ( \ - dst + x, \ - dst_stride, \ - src + src_stride * (W - x - TILE_SIZE), \ - src_stride, \ - TILE_SIZE, \ - H); \ - } \ - \ - if (trailing_pixels) \ - { \ - /* unaligned trailing part NxH (where N < TILE_SIZE) */ \ - blt_rotated_270_trivial_##suffix ( \ - dst + W, \ - dst_stride, \ - src - trailing_pixels * src_stride, \ - src_stride, \ - trailing_pixels, \ - H); \ - } \ -} \ - \ -static void \ -fast_composite_rotate_90_##suffix (pixman_implementation_t *imp, \ - pixman_op_t op, \ - pixman_image_t * src_image, \ - pixman_image_t * mask_image, \ - pixman_image_t * dst_image, \ - int32_t src_x, \ - int32_t src_y, \ - int32_t mask_x, \ - int32_t mask_y, \ - int32_t dest_x, \ - int32_t dest_y, \ - int32_t width, \ - int32_t height) \ -{ \ - pix_type *dst_line; \ - pix_type *src_line; \ - int dst_stride, src_stride; \ - int src_x_t, src_y_t; \ - \ - PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, pix_type, \ - dst_stride, dst_line, 1); \ - src_x_t = -src_y + pixman_fixed_to_int ( \ - src_image->common.transform->matrix[0][2] + \ - pixman_fixed_1 / 2 - pixman_fixed_e) - height;\ - src_y_t = src_x + pixman_fixed_to_int ( \ - src_image->common.transform->matrix[1][2] + \ - pixman_fixed_1 / 2 - pixman_fixed_e); \ - PIXMAN_IMAGE_GET_LINE (src_image, src_x_t, src_y_t, pix_type, \ - src_stride, src_line, 1); \ - blt_rotated_90_##suffix (dst_line, dst_stride, src_line, src_stride, \ - width, height); \ -} \ - \ -static void \ -fast_composite_rotate_270_##suffix (pixman_implementation_t *imp, \ - pixman_op_t op, \ - pixman_image_t * src_image, \ - pixman_image_t * mask_image, \ - pixman_image_t * dst_image, \ - int32_t src_x, \ - int32_t src_y, \ - int32_t mask_x, \ - int32_t mask_y, \ - int32_t dest_x, \ - int32_t dest_y, \ - int32_t width, \ - int32_t height) \ -{ \ - pix_type *dst_line; \ - pix_type *src_line; \ - int dst_stride, src_stride; \ - int src_x_t, src_y_t; \ - \ - PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, pix_type, \ - dst_stride, dst_line, 1); \ - src_x_t = src_y + pixman_fixed_to_int ( \ - src_image->common.transform->matrix[0][2] + \ - pixman_fixed_1 / 2 - pixman_fixed_e); \ - src_y_t = -src_x + pixman_fixed_to_int ( \ - 
src_image->common.transform->matrix[1][2] + \ - pixman_fixed_1 / 2 - pixman_fixed_e) - width; \ - PIXMAN_IMAGE_GET_LINE (src_image, src_x_t, src_y_t, pix_type, \ - src_stride, src_line, 1); \ - blt_rotated_270_##suffix (dst_line, dst_stride, src_line, src_stride, \ - width, height); \ -} - -FAST_SIMPLE_ROTATE (8, uint8_t) -FAST_SIMPLE_ROTATE (565, uint16_t) -FAST_SIMPLE_ROTATE (8888, uint32_t) - -static const pixman_fast_path_t c_fast_paths[] = -{ - PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, fast_composite_over_n_8_0565), - PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, fast_composite_over_n_8_0565), - PIXMAN_STD_FAST_PATH (OVER, solid, a8, r8g8b8, fast_composite_over_n_8_0888), - PIXMAN_STD_FAST_PATH (OVER, solid, a8, b8g8r8, fast_composite_over_n_8_0888), - PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, fast_composite_over_n_8_8888), - PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, fast_composite_over_n_8_8888), - PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, fast_composite_over_n_8_8888), - PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, fast_composite_over_n_8_8888), - PIXMAN_STD_FAST_PATH (OVER, solid, a1, a8r8g8b8, fast_composite_over_n_1_8888), - PIXMAN_STD_FAST_PATH (OVER, solid, a1, x8r8g8b8, fast_composite_over_n_1_8888), - PIXMAN_STD_FAST_PATH (OVER, solid, a1, a8b8g8r8, fast_composite_over_n_1_8888), - PIXMAN_STD_FAST_PATH (OVER, solid, a1, x8b8g8r8, fast_composite_over_n_1_8888), - PIXMAN_STD_FAST_PATH (OVER, solid, a1, r5g6b5, fast_composite_over_n_1_0565), - PIXMAN_STD_FAST_PATH (OVER, solid, a1, b5g6r5, fast_composite_over_n_1_0565), - PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, fast_composite_over_n_8888_8888_ca), - PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, fast_composite_over_n_8888_8888_ca), - PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, fast_composite_over_n_8888_0565_ca), - PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, fast_composite_over_n_8888_8888_ca), - PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, fast_composite_over_n_8888_8888_ca), - PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, fast_composite_over_n_8888_0565_ca), - PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, fast_composite_over_x888_8_8888), - PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, fast_composite_over_x888_8_8888), - PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, fast_composite_over_x888_8_8888), - PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, fast_composite_over_x888_8_8888), - PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, fast_composite_over_8888_8888), - PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, fast_composite_over_8888_8888), - PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, fast_composite_over_8888_0565), - PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, fast_composite_over_8888_8888), - PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, fast_composite_over_8888_8888), - PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, fast_composite_over_8888_0565), - PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, fast_composite_add_8888_8888), - PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, fast_composite_add_8888_8888), - PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, fast_composite_add_8_8), - PIXMAN_STD_FAST_PATH (ADD, a1, null, a1, fast_composite_add_1000_1000), - PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, fast_composite_add_n_8888_8888_ca), - PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, fast_composite_add_n_8_8), - PIXMAN_STD_FAST_PATH (SRC, 
solid, null, a8r8g8b8, fast_composite_solid_fill), - PIXMAN_STD_FAST_PATH (SRC, solid, null, x8r8g8b8, fast_composite_solid_fill), - PIXMAN_STD_FAST_PATH (SRC, solid, null, a8b8g8r8, fast_composite_solid_fill), - PIXMAN_STD_FAST_PATH (SRC, solid, null, x8b8g8r8, fast_composite_solid_fill), - PIXMAN_STD_FAST_PATH (SRC, solid, null, a1, fast_composite_solid_fill), - PIXMAN_STD_FAST_PATH (SRC, solid, null, a8, fast_composite_solid_fill), - PIXMAN_STD_FAST_PATH (SRC, solid, null, r5g6b5, fast_composite_solid_fill), - PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, fast_composite_src_x888_8888), - PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, fast_composite_src_x888_8888), - PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, fast_composite_src_memcpy), - PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, fast_composite_src_memcpy), - PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, fast_composite_src_memcpy), - PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, fast_composite_src_memcpy), - PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, fast_composite_src_memcpy), - PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, fast_composite_src_memcpy), - PIXMAN_STD_FAST_PATH (SRC, b8g8r8a8, null, b8g8r8x8, fast_composite_src_memcpy), - PIXMAN_STD_FAST_PATH (SRC, b8g8r8a8, null, b8g8r8a8, fast_composite_src_memcpy), - PIXMAN_STD_FAST_PATH (SRC, b8g8r8x8, null, b8g8r8x8, fast_composite_src_memcpy), - PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, fast_composite_src_memcpy), - PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, fast_composite_src_memcpy), - PIXMAN_STD_FAST_PATH (SRC, r8g8b8, null, r8g8b8, fast_composite_src_memcpy), - PIXMAN_STD_FAST_PATH (SRC, b8g8r8, null, b8g8r8, fast_composite_src_memcpy), - PIXMAN_STD_FAST_PATH (SRC, x1r5g5b5, null, x1r5g5b5, fast_composite_src_memcpy), - PIXMAN_STD_FAST_PATH (SRC, a1r5g5b5, null, x1r5g5b5, fast_composite_src_memcpy), - PIXMAN_STD_FAST_PATH (SRC, a8, null, a8, fast_composite_src_memcpy), - PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, fast_composite_src_x888_0565), - PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, fast_composite_src_x888_0565), - PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, fast_composite_src_x888_0565), - PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, fast_composite_src_x888_0565), - PIXMAN_STD_FAST_PATH (IN, a8, null, a8, fast_composite_in_8_8), - PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, fast_composite_in_n_8_8), - - SIMPLE_NEAREST_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, 8888_8888), - SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, 8888_8888), - SIMPLE_NEAREST_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, 8888_8888), - SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, 8888_8888), - - SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, 8888_8888), - SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, 8888_8888), - - SIMPLE_NEAREST_FAST_PATH (SRC, x8r8g8b8, r5g6b5, 8888_565), - SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, r5g6b5, 8888_565), - - SIMPLE_NEAREST_FAST_PATH (SRC, r5g6b5, r5g6b5, 565_565), - - SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, 8888_8888), - SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, 8888_8888), - SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, 8888_8888), - SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, 8888_8888), - - SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, r5g6b5, 8888_565), - -#define NEAREST_FAST_PATH(op,s,d) \ - { PIXMAN_OP_ ## op, \ - PIXMAN_ ## s, SCALED_NEAREST_FLAGS, \ - PIXMAN_null, 0, \ - PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ - 
fast_composite_scaled_nearest, \ - } - - NEAREST_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8), - NEAREST_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8), - NEAREST_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8), - NEAREST_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8), - - NEAREST_FAST_PATH (SRC, x8r8g8b8, a8r8g8b8), - NEAREST_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8), - NEAREST_FAST_PATH (SRC, x8b8g8r8, a8b8g8r8), - NEAREST_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8), - - NEAREST_FAST_PATH (OVER, x8r8g8b8, x8r8g8b8), - NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8), - NEAREST_FAST_PATH (OVER, x8b8g8r8, x8b8g8r8), - NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8), - - NEAREST_FAST_PATH (OVER, x8r8g8b8, a8r8g8b8), - NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8), - NEAREST_FAST_PATH (OVER, x8b8g8r8, a8b8g8r8), - NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8), - -#define SIMPLE_ROTATE_FLAGS(angle) \ - (FAST_PATH_ROTATE_ ## angle ## _TRANSFORM | \ - FAST_PATH_NEAREST_FILTER | \ - FAST_PATH_SAMPLES_COVER_CLIP | \ - FAST_PATH_STANDARD_FLAGS) - -#define SIMPLE_ROTATE_FAST_PATH(op,s,d,suffix) \ - { PIXMAN_OP_ ## op, \ - PIXMAN_ ## s, SIMPLE_ROTATE_FLAGS (90), \ - PIXMAN_null, 0, \ - PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ - fast_composite_rotate_90_##suffix, \ - }, \ - { PIXMAN_OP_ ## op, \ - PIXMAN_ ## s, SIMPLE_ROTATE_FLAGS (270), \ - PIXMAN_null, 0, \ - PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ - fast_composite_rotate_270_##suffix, \ - } - - SIMPLE_ROTATE_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, 8888), - SIMPLE_ROTATE_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, 8888), - SIMPLE_ROTATE_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, 8888), - SIMPLE_ROTATE_FAST_PATH (SRC, r5g6b5, r5g6b5, 565), - SIMPLE_ROTATE_FAST_PATH (SRC, a8, a8, 8), - - { PIXMAN_OP_NONE }, -}; - -#ifdef WORDS_BIGENDIAN -#define A1_FILL_MASK(n, offs) (((1 << (n)) - 1) << (32 - (offs) - (n))) -#else -#define A1_FILL_MASK(n, offs) (((1 << (n)) - 1) << (offs)) -#endif - -static force_inline void -pixman_fill1_line (uint32_t *dst, int offs, int width, int v) -{ - if (offs) - { - int leading_pixels = 32 - offs; - if (leading_pixels >= width) - { - if (v) - *dst |= A1_FILL_MASK (width, offs); - else - *dst &= ~A1_FILL_MASK (width, offs); - return; - } - else - { - if (v) - *dst++ |= A1_FILL_MASK (leading_pixels, offs); - else - *dst++ &= ~A1_FILL_MASK (leading_pixels, offs); - width -= leading_pixels; - } - } - while (width >= 32) - { - if (v) - *dst++ = 0xFFFFFFFF; - else - *dst++ = 0; - width -= 32; - } - if (width > 0) - { - if (v) - *dst |= A1_FILL_MASK (width, 0); - else - *dst &= ~A1_FILL_MASK (width, 0); - } -} - -static void -pixman_fill1 (uint32_t *bits, - int stride, - int x, - int y, - int width, - int height, - uint32_t xor) -{ - uint32_t *dst = bits + y * stride + (x >> 5); - int offs = x & 31; - - if (xor & 1) - { - while (height--) - { - pixman_fill1_line (dst, offs, width, 1); - dst += stride; - } - } - else - { - while (height--) - { - pixman_fill1_line (dst, offs, width, 0); - dst += stride; - } - } -} - -static void -pixman_fill8 (uint32_t *bits, - int stride, - int x, - int y, - int width, - int height, - uint32_t xor) -{ - int byte_stride = stride * (int) sizeof (uint32_t); - uint8_t *dst = (uint8_t *) bits; - uint8_t v = xor & 0xff; - int i; - - dst = dst + y * byte_stride + x; - - while (height--) - { - for (i = 0; i < width; ++i) - dst[i] = v; - - dst += byte_stride; - } -} - -static void -pixman_fill16 (uint32_t *bits, - int stride, - int x, - int y, - int width, - int height, - uint32_t xor) -{ - int short_stride = - (stride * (int)sizeof (uint32_t)) / (int)sizeof (uint16_t); - uint16_t *dst = 
(uint16_t *)bits; - uint16_t v = xor & 0xffff; - int i; - - dst = dst + y * short_stride + x; - - while (height--) - { - for (i = 0; i < width; ++i) - dst[i] = v; - - dst += short_stride; - } -} - -static void -pixman_fill32 (uint32_t *bits, - int stride, - int x, - int y, - int width, - int height, - uint32_t xor) -{ - int i; - - bits = bits + y * stride + x; - - while (height--) - { - for (i = 0; i < width; ++i) - bits[i] = xor; - - bits += stride; - } -} - -static pixman_bool_t -fast_path_fill (pixman_implementation_t *imp, - uint32_t * bits, - int stride, - int bpp, - int x, - int y, - int width, - int height, - uint32_t xor) -{ - switch (bpp) - { - case 1: - pixman_fill1 (bits, stride, x, y, width, height, xor); - break; - - case 8: - pixman_fill8 (bits, stride, x, y, width, height, xor); - break; - - case 16: - pixman_fill16 (bits, stride, x, y, width, height, xor); - break; - - case 32: - pixman_fill32 (bits, stride, x, y, width, height, xor); - break; - - default: - return _pixman_implementation_fill ( - imp->delegate, bits, stride, bpp, x, y, width, height, xor); - break; - } - - return TRUE; -} - -pixman_implementation_t * -_pixman_implementation_create_fast_path (pixman_implementation_t *fallback) -{ - pixman_implementation_t *imp = _pixman_implementation_create (fallback, c_fast_paths); - - imp->fill = fast_path_fill; - - return imp; -} +/* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */ +/* + * Copyright © 2000 SuSE, Inc. + * Copyright © 2007 Red Hat, Inc. + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that + * copyright notice and this permission notice appear in supporting + * documentation, and that the name of SuSE not be used in advertising or + * publicity pertaining to distribution of the software without specific, + * written prior permission. SuSE makes no representations about the + * suitability of this software for any purpose. It is provided "as is" + * without express or implied warranty. + * + * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE + * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + * Author: Keith Packard, SuSE, Inc. 
+ */ + +#ifdef HAVE_CONFIG_H +#include +#endif +#include +#include +#include "pixman-private.h" +#include "pixman-combine32.h" +#include "pixman-fast-path.h" + +static force_inline uint32_t +fetch_24 (uint8_t *a) +{ + if (((unsigned long)a) & 1) + { +#ifdef WORDS_BIGENDIAN + return (*a << 16) | (*(uint16_t *)(a + 1)); +#else + return *a | (*(uint16_t *)(a + 1) << 8); +#endif + } + else + { +#ifdef WORDS_BIGENDIAN + return (*(uint16_t *)a << 8) | *(a + 2); +#else + return *(uint16_t *)a | (*(a + 2) << 16); +#endif + } +} + +static force_inline void +store_24 (uint8_t *a, + uint32_t v) +{ + if (((unsigned long)a) & 1) + { +#ifdef WORDS_BIGENDIAN + *a = (uint8_t) (v >> 16); + *(uint16_t *)(a + 1) = (uint16_t) (v); +#else + *a = (uint8_t) (v); + *(uint16_t *)(a + 1) = (uint16_t) (v >> 8); +#endif + } + else + { +#ifdef WORDS_BIGENDIAN + *(uint16_t *)a = (uint16_t)(v >> 8); + *(a + 2) = (uint8_t)v; +#else + *(uint16_t *)a = (uint16_t)v; + *(a + 2) = (uint8_t)(v >> 16); +#endif + } +} + +static force_inline uint32_t +over (uint32_t src, + uint32_t dest) +{ + uint32_t a = ~src >> 24; + + UN8x4_MUL_UN8_ADD_UN8x4 (dest, a, src); + + return dest; +} + +static uint32_t +in (uint32_t x, + uint8_t y) +{ + uint16_t a = y; + + UN8x4_MUL_UN8 (x, a); + + return x; +} + +/* + * Naming convention: + * + * op_src_mask_dest + */ +static void +fast_composite_over_x888_8_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t *src, *src_line; + uint32_t *dst, *dst_line; + uint8_t *mask, *mask_line; + int src_stride, mask_stride, dst_stride; + uint8_t m; + uint32_t s, d; + int32_t w; + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + + while (height--) + { + src = src_line; + src_line += src_stride; + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + + w = width; + while (w--) + { + m = *mask++; + if (m) + { + s = *src | 0xff000000; + + if (m == 0xff) + { + *dst = s; + } + else + { + d = in (s, m); + *dst = over (d, *dst); + } + } + src++; + dst++; + } + } +} + +static void +fast_composite_in_n_8_8 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dest_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src, srca; + uint8_t *dst_line, *dst; + uint8_t *mask_line, *mask, m; + int dst_stride, mask_stride; + int32_t w; + uint16_t t; + + src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); + + srca = src >> 24; + + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); + + if (srca == 0xff) + { + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; + + while (w--) + { + m = *mask++; + + if (m == 0) + *dst = 0; + else if (m != 0xff) + *dst = MUL_UN8 (m, *dst, t); + + dst++; + } + } + } + else + { + while (height--) + { + dst = 
dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; + + while (w--) + { + m = *mask++; + m = MUL_UN8 (m, srca, t); + + if (m == 0) + *dst = 0; + else if (m != 0xff) + *dst = MUL_UN8 (m, *dst, t); + + dst++; + } + } + } +} + +static void +fast_composite_in_8_8 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dest_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint8_t *dst_line, *dst; + uint8_t *src_line, *src; + int dst_stride, src_stride; + int32_t w; + uint8_t s; + uint16_t t; + + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1); + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; + + while (w--) + { + s = *src++; + + if (s == 0) + *dst = 0; + else if (s != 0xff) + *dst = MUL_UN8 (s, *dst, t); + + dst++; + } + } +} + +static void +fast_composite_over_n_8_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src, srca; + uint32_t *dst_line, *dst, d; + uint8_t *mask_line, *mask, m; + int dst_stride, mask_stride; + int32_t w; + + src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format); + + srca = src >> 24; + if (src == 0) + return; + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; + + while (w--) + { + m = *mask++; + if (m == 0xff) + { + if (srca == 0xff) + *dst = src; + else + *dst = over (src, *dst); + } + else if (m) + { + d = in (src, m); + *dst = over (d, *dst); + } + dst++; + } + } +} + +static void +fast_composite_add_n_8888_8888_ca (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src, s; + uint32_t *dst_line, *dst, d; + uint32_t *mask_line, *mask, ma; + int dst_stride, mask_stride; + int32_t w; + + src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format); + + if (src == 0) + return; + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; + + while (w--) + { + ma = *mask++; + + if (ma) + { + d = *dst; + s = src; + + UN8x4_MUL_UN8x4_ADD_UN8x4 (s, ma, d); + + *dst = s; + } + + dst++; + } + } +} + +static void +fast_composite_over_n_8888_8888_ca (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + 
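/* Illustrative sketch (editorial, not part of this patch): the Porter-Duff
 * OVER operator that the over () / in () helpers above implement on one
 * premultiplied a8r8g8b8 pixel, written with plain per-channel arithmetic
 * instead of pixman's UN8x4_* macros.  over_sketch is a hypothetical name
 * and the code assumes the <stdint.h> fixed-width types. */
static uint32_t
over_sketch (uint32_t src, uint32_t dest)
{
    uint32_t ia = 255 - (src >> 24);   /* inverse of the source alpha */
    uint32_t result = 0;
    int shift;

    for (shift = 0; shift < 32; shift += 8)
    {
	uint32_t s = (src  >> shift) & 0xff;
	uint32_t d = (dest >> shift) & 0xff;
	uint32_t m = d * ia + 0x80;              /* d * ia / 255, rounded  */
	uint32_t r = s + (((m >> 8) + m) >> 8);  /* same rounding as MUL_UN8 */

	if (r > 0xff)   /* cannot happen for valid premultiplied input;   */
	    r = 0xff;   /* kept only to make the sketch obviously safe    */
	result |= r << shift;
    }
    return result;
}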
int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src, srca, s; + uint32_t *dst_line, *dst, d; + uint32_t *mask_line, *mask, ma; + int dst_stride, mask_stride; + int32_t w; + + src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format); + + srca = src >> 24; + if (src == 0) + return; + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; + + while (w--) + { + ma = *mask++; + if (ma == 0xffffffff) + { + if (srca == 0xff) + *dst = src; + else + *dst = over (src, *dst); + } + else if (ma) + { + d = *dst; + s = src; + + UN8x4_MUL_UN8x4 (s, ma); + UN8x4_MUL_UN8 (ma, srca); + ma = ~ma; + UN8x4_MUL_UN8x4_ADD_UN8x4 (d, ma, s); + + *dst = d; + } + + dst++; + } + } +} + +static void +fast_composite_over_n_8_0888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src, srca; + uint8_t *dst_line, *dst; + uint32_t d; + uint8_t *mask_line, *mask, m; + int dst_stride, mask_stride; + int32_t w; + + src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format); + + srca = src >> 24; + if (src == 0) + return; + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 3); + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; + + while (w--) + { + m = *mask++; + if (m == 0xff) + { + if (srca == 0xff) + { + d = src; + } + else + { + d = fetch_24 (dst); + d = over (src, d); + } + store_24 (dst, d); + } + else if (m) + { + d = over (in (src, m), fetch_24 (dst)); + store_24 (dst, d); + } + dst += 3; + } + } +} + +static void +fast_composite_over_n_8_0565 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src, srca; + uint16_t *dst_line, *dst; + uint32_t d; + uint8_t *mask_line, *mask, m; + int dst_stride, mask_stride; + int32_t w; + + src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format); + + srca = src >> 24; + if (src == 0) + return; + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; + + while (w--) + { + m = *mask++; + if (m == 0xff) + { + if (srca == 0xff) + { + d = src; + } + else + { + d = *dst; + d = over (src, CONVERT_0565_TO_0888 (d)); + } + *dst = CONVERT_8888_TO_0565 (d); + } + else if (m) + { + d = *dst; + d = over (in (src, m), CONVERT_0565_TO_0888 (d)); + *dst = CONVERT_8888_TO_0565 (d); + } + dst++; + } + } +} + +static void +fast_composite_over_n_8888_0565_ca (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * 
mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src, srca, s; + uint16_t src16; + uint16_t *dst_line, *dst; + uint32_t d; + uint32_t *mask_line, *mask, ma; + int dst_stride, mask_stride; + int32_t w; + + src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format); + + srca = src >> 24; + if (src == 0) + return; + + src16 = CONVERT_8888_TO_0565 (src); + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; + + while (w--) + { + ma = *mask++; + if (ma == 0xffffffff) + { + if (srca == 0xff) + { + *dst = src16; + } + else + { + d = *dst; + d = over (src, CONVERT_0565_TO_0888 (d)); + *dst = CONVERT_8888_TO_0565 (d); + } + } + else if (ma) + { + d = *dst; + d = CONVERT_0565_TO_0888 (d); + + s = src; + + UN8x4_MUL_UN8x4 (s, ma); + UN8x4_MUL_UN8 (ma, srca); + ma = ~ma; + UN8x4_MUL_UN8x4_ADD_UN8x4 (d, ma, s); + + *dst = CONVERT_8888_TO_0565 (d); + } + dst++; + } + } +} + +static void +fast_composite_over_8888_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t *dst_line, *dst; + uint32_t *src_line, *src, s; + int dst_stride, src_stride; + uint8_t a; + int32_t w; + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; + + while (w--) + { + s = *src++; + a = s >> 24; + if (a == 0xff) + *dst = s; + else if (s) + *dst = over (s, *dst); + dst++; + } + } +} + +static void +fast_composite_src_x888_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t *dst_line, *dst; + uint32_t *src_line, *src; + int dst_stride, src_stride; + int32_t w; + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; + + while (w--) + *dst++ = (*src++) | 0xff000000; + } +} + +#if 0 +static void +fast_composite_over_8888_0888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint8_t *dst_line, *dst; + uint32_t d; + uint32_t *src_line, *src, s; + uint8_t a; + int dst_stride, src_stride; + int32_t w; + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 3); + PIXMAN_IMAGE_GET_LINE (src_image, 
src_x, src_y, uint32_t, src_stride, src_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; + + while (w--) + { + s = *src++; + a = s >> 24; + if (a) + { + if (a == 0xff) + d = s; + else + d = over (s, fetch_24 (dst)); + + store_24 (dst, d); + } + dst += 3; + } + } +} +#endif + +static void +fast_composite_over_8888_0565 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint16_t *dst_line, *dst; + uint32_t d; + uint32_t *src_line, *src, s; + uint8_t a; + int dst_stride, src_stride; + int32_t w; + + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; + + while (w--) + { + s = *src++; + a = s >> 24; + if (s) + { + if (a == 0xff) + { + d = s; + } + else + { + d = *dst; + d = over (s, CONVERT_0565_TO_0888 (d)); + } + *dst = CONVERT_8888_TO_0565 (d); + } + dst++; + } + } +} + +static void +fast_composite_src_x888_0565 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint16_t *dst_line, *dst; + uint32_t *src_line, *src, s; + int dst_stride, src_stride; + int32_t w; + + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; + + while (w--) + { + s = *src++; + *dst = CONVERT_8888_TO_0565 (s); + dst++; + } + } +} + +static void +fast_composite_add_8_8 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint8_t *dst_line, *dst; + uint8_t *src_line, *src; + int dst_stride, src_stride; + int32_t w; + uint8_t s, d; + uint16_t t; + + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1); + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; + + while (w--) + { + s = *src++; + if (s) + { + if (s != 0xff) + { + d = *dst; + t = d + s; + s = t | (0 - (t >> 8)); + } + *dst = s; + } + dst++; + } + } +} + +static void +fast_composite_add_8888_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t *dst_line, *dst; + uint32_t *src_line, *src; + int dst_stride, src_stride; + int32_t w; + uint32_t s, d; + + PIXMAN_IMAGE_GET_LINE 
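/* Illustrative sketch (editorial, not part of this patch): the branchless
 * saturating add used by fast_composite_add_8_8 above.  For 8-bit inputs the
 * sum fits in 9 bits, so (t >> 8) is 1 exactly when the sum overflowed, and
 * (0 - 1) is all ones, forcing the truncated result to 0xff.
 * saturating_add_u8 is a hypothetical standalone helper. */
static uint8_t
saturating_add_u8 (uint8_t d, uint8_t s)
{
    uint16_t t = (uint16_t) d + s;

    /* e.g. 0xf0 + 0x20 = 0x110; 0x110 >> 8 = 1; 0x110 | (0 - 1) = ~0;
     * truncated back to 8 bits this yields 0xff. */
    return (uint8_t) (t | (0 - (t >> 8)));
}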
(src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; + + while (w--) + { + s = *src++; + if (s) + { + if (s != 0xffffffff) + { + d = *dst; + if (d) + UN8x4_ADD_UN8x4 (s, d); + } + *dst = s; + } + dst++; + } + } +} + +static void +fast_composite_add_n_8_8 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint8_t *dst_line, *dst; + uint8_t *mask_line, *mask; + int dst_stride, mask_stride; + int32_t w; + uint32_t src; + uint8_t sa; + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); + src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format); + sa = (src >> 24); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; + + while (w--) + { + uint16_t tmp; + uint16_t a; + uint32_t m, d; + uint32_t r; + + a = *mask++; + d = *dst; + + m = MUL_UN8 (sa, a, tmp); + r = ADD_UN8 (m, d, tmp); + + *dst++ = r; + } + } +} + +#ifdef WORDS_BIGENDIAN +#define CREATE_BITMASK(n) (0x80000000 >> (n)) +#define UPDATE_BITMASK(n) ((n) >> 1) +#else +#define CREATE_BITMASK(n) (1 << (n)) +#define UPDATE_BITMASK(n) ((n) << 1) +#endif + +#define TEST_BIT(p, n) \ + (*((p) + ((n) >> 5)) & CREATE_BITMASK ((n) & 31)) +#define SET_BIT(p, n) \ + do { *((p) + ((n) >> 5)) |= CREATE_BITMASK ((n) & 31); } while (0); + +static void +fast_composite_add_1000_1000 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t *dst_line, *dst; + uint32_t *src_line, *src; + int dst_stride, src_stride; + int32_t w; + + PIXMAN_IMAGE_GET_LINE (src_image, 0, src_y, uint32_t, + src_stride, src_line, 1); + PIXMAN_IMAGE_GET_LINE (dst_image, 0, dest_y, uint32_t, + dst_stride, dst_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; + + while (w--) + { + /* + * TODO: improve performance by processing uint32_t data instead + * of individual bits + */ + if (TEST_BIT (src, src_x + w)) + SET_BIT (dst, dest_x + w); + } + } +} + +static void +fast_composite_over_n_1_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src, srca; + uint32_t *dst, *dst_line; + uint32_t *mask, *mask_line; + int mask_stride, dst_stride; + uint32_t bitcache, bitmask; + int32_t w; + + if (width <= 0) + return; + + src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format); + srca = src >> 24; + if (src == 0) + return; + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, + dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, 0, 
mask_y, uint32_t, + mask_stride, mask_line, 1); + mask_line += mask_x >> 5; + + if (srca == 0xff) + { + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; + + bitcache = *mask++; + bitmask = CREATE_BITMASK (mask_x & 31); + + while (w--) + { + if (bitmask == 0) + { + bitcache = *mask++; + bitmask = CREATE_BITMASK (0); + } + if (bitcache & bitmask) + *dst = src; + bitmask = UPDATE_BITMASK (bitmask); + dst++; + } + } + } + else + { + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; + + bitcache = *mask++; + bitmask = CREATE_BITMASK (mask_x & 31); + + while (w--) + { + if (bitmask == 0) + { + bitcache = *mask++; + bitmask = CREATE_BITMASK (0); + } + if (bitcache & bitmask) + *dst = over (src, *dst); + bitmask = UPDATE_BITMASK (bitmask); + dst++; + } + } + } +} + +static void +fast_composite_over_n_1_0565 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src, srca; + uint16_t *dst, *dst_line; + uint32_t *mask, *mask_line; + int mask_stride, dst_stride; + uint32_t bitcache, bitmask; + int32_t w; + uint32_t d; + uint16_t src565; + + if (width <= 0) + return; + + src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format); + srca = src >> 24; + if (src == 0) + return; + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, + dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, 0, mask_y, uint32_t, + mask_stride, mask_line, 1); + mask_line += mask_x >> 5; + + if (srca == 0xff) + { + src565 = CONVERT_8888_TO_0565 (src); + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; + + bitcache = *mask++; + bitmask = CREATE_BITMASK (mask_x & 31); + + while (w--) + { + if (bitmask == 0) + { + bitcache = *mask++; + bitmask = CREATE_BITMASK (0); + } + if (bitcache & bitmask) + *dst = src565; + bitmask = UPDATE_BITMASK (bitmask); + dst++; + } + } + } + else + { + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; + + bitcache = *mask++; + bitmask = CREATE_BITMASK (mask_x & 31); + + while (w--) + { + if (bitmask == 0) + { + bitcache = *mask++; + bitmask = CREATE_BITMASK (0); + } + if (bitcache & bitmask) + { + d = over (src, CONVERT_0565_TO_0888 (*dst)); + *dst = CONVERT_8888_TO_0565 (d); + } + bitmask = UPDATE_BITMASK (bitmask); + dst++; + } + } + } +} + +/* + * Simple bitblt + */ + +static void +fast_composite_solid_fill (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src; + + src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format); + + if (dst_image->bits.format == PIXMAN_a1) + { + src = src >> 31; + } + else if (dst_image->bits.format == PIXMAN_a8) + { + src = src >> 24; + } + else if (dst_image->bits.format == PIXMAN_r5g6b5 || + dst_image->bits.format == PIXMAN_b5g6r5) + { + src = CONVERT_8888_TO_0565 (src); + } + + pixman_fill (dst_image->bits.bits, dst_image->bits.rowstride, + 
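/* Illustrative sketch (editorial, not part of this patch): scanning a 1 bpp
 * mask word-by-word the way fast_composite_over_n_1_8888 above does.  On
 * little-endian hosts the leftmost pixel of a 32-bit word is bit 0, on
 * big-endian hosts it is bit 31; CREATE_BITMASK / UPDATE_BITMASK hide that
 * difference.  count_set_mask_bits is a hypothetical helper; mask must point
 * at the word containing pixel mask_x, as the real code arranges with
 * mask_line += mask_x >> 5. */
static int
count_set_mask_bits (const uint32_t *mask, int mask_x, int width)
{
    uint32_t bitcache = *mask++;
    uint32_t bitmask = CREATE_BITMASK (mask_x & 31);
    int n = 0;

    while (width--)
    {
	if (bitmask == 0)            /* crossed into the next 32-bit word */
	{
	    bitcache = *mask++;
	    bitmask = CREATE_BITMASK (0);
	}
	if (bitcache & bitmask)
	    n++;
	bitmask = UPDATE_BITMASK (bitmask);
    }
    return n;
}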
PIXMAN_FORMAT_BPP (dst_image->bits.format), + dest_x, dest_y, + width, height, + src); +} + +static void +fast_composite_src_memcpy (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + int bpp = PIXMAN_FORMAT_BPP (dst_image->bits.format) / 8; + uint32_t n_bytes = width * bpp; + int dst_stride, src_stride; + uint8_t *dst; + uint8_t *src; + + src_stride = src_image->bits.rowstride * 4; + dst_stride = dst_image->bits.rowstride * 4; + + src = (uint8_t *)src_image->bits.bits + src_y * src_stride + src_x * bpp; + dst = (uint8_t *)dst_image->bits.bits + dest_y * dst_stride + dest_x * bpp; + + while (height--) + { + memcpy (dst, src, n_bytes); + + dst += dst_stride; + src += src_stride; + } +} + +FAST_NEAREST (8888_8888_cover, 8888, 8888, uint32_t, uint32_t, SRC, COVER) +FAST_NEAREST (8888_8888_none, 8888, 8888, uint32_t, uint32_t, SRC, NONE) +FAST_NEAREST (8888_8888_pad, 8888, 8888, uint32_t, uint32_t, SRC, PAD) +FAST_NEAREST (8888_8888_normal, 8888, 8888, uint32_t, uint32_t, SRC, NORMAL) +FAST_NEAREST (8888_8888_cover, 8888, 8888, uint32_t, uint32_t, OVER, COVER) +FAST_NEAREST (8888_8888_none, 8888, 8888, uint32_t, uint32_t, OVER, NONE) +FAST_NEAREST (8888_8888_pad, 8888, 8888, uint32_t, uint32_t, OVER, PAD) +FAST_NEAREST (8888_8888_normal, 8888, 8888, uint32_t, uint32_t, OVER, NORMAL) +FAST_NEAREST (8888_565_cover, 8888, 0565, uint32_t, uint16_t, SRC, COVER) +FAST_NEAREST (8888_565_none, 8888, 0565, uint32_t, uint16_t, SRC, NONE) +FAST_NEAREST (8888_565_pad, 8888, 0565, uint32_t, uint16_t, SRC, PAD) +FAST_NEAREST (8888_565_normal, 8888, 0565, uint32_t, uint16_t, SRC, NORMAL) +FAST_NEAREST (565_565_normal, 0565, 0565, uint16_t, uint16_t, SRC, NORMAL) +FAST_NEAREST (8888_565_cover, 8888, 0565, uint32_t, uint16_t, OVER, COVER) +FAST_NEAREST (8888_565_none, 8888, 0565, uint32_t, uint16_t, OVER, NONE) +FAST_NEAREST (8888_565_pad, 8888, 0565, uint32_t, uint16_t, OVER, PAD) +FAST_NEAREST (8888_565_normal, 8888, 0565, uint32_t, uint16_t, OVER, NORMAL) + +/* Use more unrolling for src_0565_0565 because it is typically CPU bound */ +static force_inline void +scaled_nearest_scanline_565_565_SRC (uint16_t * dst, + const uint16_t * src, + int32_t w, + pixman_fixed_t vx, + pixman_fixed_t unit_x, + pixman_fixed_t max_vx, + pixman_bool_t fully_transparent_src) +{ + uint16_t tmp1, tmp2, tmp3, tmp4; + while ((w -= 4) >= 0) + { + tmp1 = src[pixman_fixed_to_int (vx)]; + vx += unit_x; + tmp2 = src[pixman_fixed_to_int (vx)]; + vx += unit_x; + tmp3 = src[pixman_fixed_to_int (vx)]; + vx += unit_x; + tmp4 = src[pixman_fixed_to_int (vx)]; + vx += unit_x; + *dst++ = tmp1; + *dst++ = tmp2; + *dst++ = tmp3; + *dst++ = tmp4; + } + if (w & 2) + { + tmp1 = src[pixman_fixed_to_int (vx)]; + vx += unit_x; + tmp2 = src[pixman_fixed_to_int (vx)]; + vx += unit_x; + *dst++ = tmp1; + *dst++ = tmp2; + } + if (w & 1) + *dst++ = src[pixman_fixed_to_int (vx)]; +} + +FAST_NEAREST_MAINLOOP (565_565_cover_SRC, + scaled_nearest_scanline_565_565_SRC, + uint16_t, uint16_t, COVER) +FAST_NEAREST_MAINLOOP (565_565_none_SRC, + scaled_nearest_scanline_565_565_SRC, + uint16_t, uint16_t, NONE) +FAST_NEAREST_MAINLOOP (565_565_pad_SRC, + scaled_nearest_scanline_565_565_SRC, + uint16_t, uint16_t, PAD) + +static force_inline uint32_t +fetch_nearest (pixman_repeat_t src_repeat, + pixman_format_code_t format, + uint32_t *src, 
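/* Illustrative sketch (editorial, not part of this patch): the 16.16
 * fixed-point stepping used by the nearest scanline functions above.  vx
 * carries the source x coordinate as a pixman_fixed_t, pixman_fixed_to_int
 * (vx >> 16) picks the nearest source pixel, and unit_x is the increment per
 * destination pixel.  scale_row_nearest_sketch is a hypothetical, un-unrolled
 * version of the same idea for 32 bpp pixels. */
static void
scale_row_nearest_sketch (uint32_t       *dst,
                          const uint32_t *src,
                          int             width,
                          pixman_fixed_t  vx,
                          pixman_fixed_t  unit_x)
{
    while (width--)
    {
	*dst++ = src[pixman_fixed_to_int (vx)];
	vx += unit_x;
    }
}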
int x, int src_width) +{ + if (repeat (src_repeat, &x, src_width)) + { + if (format == PIXMAN_x8r8g8b8) + return *(src + x) | 0xff000000; + else + return *(src + x); + } + else + { + return 0; + } +} + +static force_inline void +combine_over (uint32_t s, uint32_t *dst) +{ + if (s) + { + uint8_t ia = 0xff - (s >> 24); + + if (ia) + UN8x4_MUL_UN8_ADD_UN8x4 (*dst, ia, s); + else + *dst = s; + } +} + +static force_inline void +combine_src (uint32_t s, uint32_t *dst) +{ + *dst = s; +} + +static void +fast_composite_scaled_nearest (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t *dst_line; + uint32_t *src_line; + int dst_stride, src_stride; + int src_width, src_height; + pixman_repeat_t src_repeat; + pixman_fixed_t unit_x, unit_y; + pixman_format_code_t src_format; + pixman_vector_t v; + pixman_fixed_t vy; + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + /* pass in 0 instead of src_x and src_y because src_x and src_y need to be + * transformed from destination space to source space + */ + PIXMAN_IMAGE_GET_LINE (src_image, 0, 0, uint32_t, src_stride, src_line, 1); + + /* reference point is the center of the pixel */ + v.vector[0] = pixman_int_to_fixed (src_x) + pixman_fixed_1 / 2; + v.vector[1] = pixman_int_to_fixed (src_y) + pixman_fixed_1 / 2; + v.vector[2] = pixman_fixed_1; + + if (!pixman_transform_point_3d (src_image->common.transform, &v)) + return; + + unit_x = src_image->common.transform->matrix[0][0]; + unit_y = src_image->common.transform->matrix[1][1]; + + /* Round down to closest integer, ensuring that 0.5 rounds to 0, not 1 */ + v.vector[0] -= pixman_fixed_e; + v.vector[1] -= pixman_fixed_e; + + src_height = src_image->bits.height; + src_width = src_image->bits.width; + src_repeat = src_image->common.repeat; + src_format = src_image->bits.format; + + vy = v.vector[1]; + while (height--) + { + pixman_fixed_t vx = v.vector[0]; + int y = pixman_fixed_to_int (vy); + uint32_t *dst = dst_line; + + dst_line += dst_stride; + + /* adjust the y location by a unit vector in the y direction + * this is equivalent to transforming y+1 of the destination point to source space */ + vy += unit_y; + + if (!repeat (src_repeat, &y, src_height)) + { + if (op == PIXMAN_OP_SRC) + memset (dst, 0, sizeof (*dst) * width); + } + else + { + int w = width; + + uint32_t *src = src_line + y * src_stride; + + while (w >= 2) + { + uint32_t s1, s2; + int x1, x2; + + x1 = pixman_fixed_to_int (vx); + vx += unit_x; + + x2 = pixman_fixed_to_int (vx); + vx += unit_x; + + w -= 2; + + s1 = fetch_nearest (src_repeat, src_format, src, x1, src_width); + s2 = fetch_nearest (src_repeat, src_format, src, x2, src_width); + + if (op == PIXMAN_OP_OVER) + { + combine_over (s1, dst++); + combine_over (s2, dst++); + } + else + { + combine_src (s1, dst++); + combine_src (s2, dst++); + } + } + + while (w--) + { + uint32_t s; + int x; + + x = pixman_fixed_to_int (vx); + vx += unit_x; + + s = fetch_nearest (src_repeat, src_format, src, x, src_width); + + if (op == PIXMAN_OP_OVER) + combine_over (s, dst++); + else + combine_src (s, dst++); + } + } + } +} + +#define CACHE_LINE_SIZE 64 + +#define FAST_SIMPLE_ROTATE(suffix, pix_type) \ + \ +static void \ +blt_rotated_90_trivial_##suffix (pix_type *dst, \ + int dst_stride, \ + const pix_type *src, \ + int 
src_stride, \ + int w, \ + int h) \ +{ \ + int x, y; \ + for (y = 0; y < h; y++) \ + { \ + const pix_type *s = src + (h - y - 1); \ + pix_type *d = dst + dst_stride * y; \ + for (x = 0; x < w; x++) \ + { \ + *d++ = *s; \ + s += src_stride; \ + } \ + } \ +} \ + \ +static void \ +blt_rotated_270_trivial_##suffix (pix_type *dst, \ + int dst_stride, \ + const pix_type *src, \ + int src_stride, \ + int w, \ + int h) \ +{ \ + int x, y; \ + for (y = 0; y < h; y++) \ + { \ + const pix_type *s = src + src_stride * (w - 1) + y; \ + pix_type *d = dst + dst_stride * y; \ + for (x = 0; x < w; x++) \ + { \ + *d++ = *s; \ + s -= src_stride; \ + } \ + } \ +} \ + \ +static void \ +blt_rotated_90_##suffix (pix_type *dst, \ + int dst_stride, \ + const pix_type *src, \ + int src_stride, \ + int W, \ + int H) \ +{ \ + int x; \ + int leading_pixels = 0, trailing_pixels = 0; \ + const int TILE_SIZE = CACHE_LINE_SIZE / sizeof(pix_type); \ + \ + /* \ + * split processing into handling destination as TILE_SIZExH cache line \ + * aligned vertical stripes (optimistically assuming that destination \ + * stride is a multiple of cache line, if not - it will be just a bit \ + * slower) \ + */ \ + \ + if ((uintptr_t)dst & (CACHE_LINE_SIZE - 1)) \ + { \ + leading_pixels = TILE_SIZE - (((uintptr_t)dst & \ + (CACHE_LINE_SIZE - 1)) / sizeof(pix_type)); \ + if (leading_pixels > W) \ + leading_pixels = W; \ + \ + /* unaligned leading part NxH (where N < TILE_SIZE) */ \ + blt_rotated_90_trivial_##suffix ( \ + dst, \ + dst_stride, \ + src, \ + src_stride, \ + leading_pixels, \ + H); \ + \ + dst += leading_pixels; \ + src += leading_pixels * src_stride; \ + W -= leading_pixels; \ + } \ + \ + if ((uintptr_t)(dst + W) & (CACHE_LINE_SIZE - 1)) \ + { \ + trailing_pixels = (((uintptr_t)(dst + W) & \ + (CACHE_LINE_SIZE - 1)) / sizeof(pix_type)); \ + if (trailing_pixels > W) \ + trailing_pixels = W; \ + W -= trailing_pixels; \ + } \ + \ + for (x = 0; x < W; x += TILE_SIZE) \ + { \ + /* aligned middle part TILE_SIZExH */ \ + blt_rotated_90_trivial_##suffix ( \ + dst + x, \ + dst_stride, \ + src + src_stride * x, \ + src_stride, \ + TILE_SIZE, \ + H); \ + } \ + \ + if (trailing_pixels) \ + { \ + /* unaligned trailing part NxH (where N < TILE_SIZE) */ \ + blt_rotated_90_trivial_##suffix ( \ + dst + W, \ + dst_stride, \ + src + W * src_stride, \ + src_stride, \ + trailing_pixels, \ + H); \ + } \ +} \ + \ +static void \ +blt_rotated_270_##suffix (pix_type *dst, \ + int dst_stride, \ + const pix_type *src, \ + int src_stride, \ + int W, \ + int H) \ +{ \ + int x; \ + int leading_pixels = 0, trailing_pixels = 0; \ + const int TILE_SIZE = CACHE_LINE_SIZE / sizeof(pix_type); \ + \ + /* \ + * split processing into handling destination as TILE_SIZExH cache line \ + * aligned vertical stripes (optimistically assuming that destination \ + * stride is a multiple of cache line, if not - it will be just a bit \ + * slower) \ + */ \ + \ + if ((uintptr_t)dst & (CACHE_LINE_SIZE - 1)) \ + { \ + leading_pixels = TILE_SIZE - (((uintptr_t)dst & \ + (CACHE_LINE_SIZE - 1)) / sizeof(pix_type)); \ + if (leading_pixels > W) \ + leading_pixels = W; \ + \ + /* unaligned leading part NxH (where N < TILE_SIZE) */ \ + blt_rotated_270_trivial_##suffix ( \ + dst, \ + dst_stride, \ + src + src_stride * (W - leading_pixels), \ + src_stride, \ + leading_pixels, \ + H); \ + \ + dst += leading_pixels; \ + W -= leading_pixels; \ + } \ + \ + if ((uintptr_t)(dst + W) & (CACHE_LINE_SIZE - 1)) \ + { \ + trailing_pixels = (((uintptr_t)(dst + W) & \ + (CACHE_LINE_SIZE - 1)) / 
sizeof(pix_type)); \ + if (trailing_pixels > W) \ + trailing_pixels = W; \ + W -= trailing_pixels; \ + src += trailing_pixels * src_stride; \ + } \ + \ + for (x = 0; x < W; x += TILE_SIZE) \ + { \ + /* aligned middle part TILE_SIZExH */ \ + blt_rotated_270_trivial_##suffix ( \ + dst + x, \ + dst_stride, \ + src + src_stride * (W - x - TILE_SIZE), \ + src_stride, \ + TILE_SIZE, \ + H); \ + } \ + \ + if (trailing_pixels) \ + { \ + /* unaligned trailing part NxH (where N < TILE_SIZE) */ \ + blt_rotated_270_trivial_##suffix ( \ + dst + W, \ + dst_stride, \ + src - trailing_pixels * src_stride, \ + src_stride, \ + trailing_pixels, \ + H); \ + } \ +} \ + \ +static void \ +fast_composite_rotate_90_##suffix (pixman_implementation_t *imp, \ + pixman_op_t op, \ + pixman_image_t * src_image, \ + pixman_image_t * mask_image, \ + pixman_image_t * dst_image, \ + int32_t src_x, \ + int32_t src_y, \ + int32_t mask_x, \ + int32_t mask_y, \ + int32_t dest_x, \ + int32_t dest_y, \ + int32_t width, \ + int32_t height) \ +{ \ + pix_type *dst_line; \ + pix_type *src_line; \ + int dst_stride, src_stride; \ + int src_x_t, src_y_t; \ + \ + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, pix_type, \ + dst_stride, dst_line, 1); \ + src_x_t = -src_y + pixman_fixed_to_int ( \ + src_image->common.transform->matrix[0][2] + \ + pixman_fixed_1 / 2 - pixman_fixed_e) - height;\ + src_y_t = src_x + pixman_fixed_to_int ( \ + src_image->common.transform->matrix[1][2] + \ + pixman_fixed_1 / 2 - pixman_fixed_e); \ + PIXMAN_IMAGE_GET_LINE (src_image, src_x_t, src_y_t, pix_type, \ + src_stride, src_line, 1); \ + blt_rotated_90_##suffix (dst_line, dst_stride, src_line, src_stride, \ + width, height); \ +} \ + \ +static void \ +fast_composite_rotate_270_##suffix (pixman_implementation_t *imp, \ + pixman_op_t op, \ + pixman_image_t * src_image, \ + pixman_image_t * mask_image, \ + pixman_image_t * dst_image, \ + int32_t src_x, \ + int32_t src_y, \ + int32_t mask_x, \ + int32_t mask_y, \ + int32_t dest_x, \ + int32_t dest_y, \ + int32_t width, \ + int32_t height) \ +{ \ + pix_type *dst_line; \ + pix_type *src_line; \ + int dst_stride, src_stride; \ + int src_x_t, src_y_t; \ + \ + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, pix_type, \ + dst_stride, dst_line, 1); \ + src_x_t = src_y + pixman_fixed_to_int ( \ + src_image->common.transform->matrix[0][2] + \ + pixman_fixed_1 / 2 - pixman_fixed_e); \ + src_y_t = -src_x + pixman_fixed_to_int ( \ + src_image->common.transform->matrix[1][2] + \ + pixman_fixed_1 / 2 - pixman_fixed_e) - width; \ + PIXMAN_IMAGE_GET_LINE (src_image, src_x_t, src_y_t, pix_type, \ + src_stride, src_line, 1); \ + blt_rotated_270_##suffix (dst_line, dst_stride, src_line, src_stride, \ + width, height); \ +} + +FAST_SIMPLE_ROTATE (8, uint8_t) +FAST_SIMPLE_ROTATE (565, uint16_t) +FAST_SIMPLE_ROTATE (8888, uint32_t) + +static const pixman_fast_path_t c_fast_paths[] = +{ + PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, fast_composite_over_n_8_0565), + PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, fast_composite_over_n_8_0565), + PIXMAN_STD_FAST_PATH (OVER, solid, a8, r8g8b8, fast_composite_over_n_8_0888), + PIXMAN_STD_FAST_PATH (OVER, solid, a8, b8g8r8, fast_composite_over_n_8_0888), + PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, fast_composite_over_n_8_8888), + PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, fast_composite_over_n_8_8888), + PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, fast_composite_over_n_8_8888), + PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, fast_composite_over_n_8_8888), + 
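/* Illustrative sketch (editorial, not part of this patch): the pixel mapping
 * implemented by the blt_rotated_90_trivial_* helpers above.  For a 90-degree
 * rotation, destination pixel (x, y) is read from source column h - 1 - y,
 * source row x, where w and h are the destination width and height.
 * rotate_90_sketch is a hypothetical uint32_t-only version without the
 * cache-line tiling done by blt_rotated_90_*. */
static void
rotate_90_sketch (uint32_t       *dst, int dst_stride,
                  const uint32_t *src, int src_stride,
                  int w, int h)
{
    int x, y;

    for (y = 0; y < h; y++)
    {
	const uint32_t *s = src + (h - y - 1);   /* pick the source column */
	uint32_t *d = dst + dst_stride * y;

	for (x = 0; x < w; x++)
	{
	    *d++ = *s;
	    s += src_stride;                     /* walk down that column  */
	}
    }
}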
PIXMAN_STD_FAST_PATH (OVER, solid, a1, a8r8g8b8, fast_composite_over_n_1_8888), + PIXMAN_STD_FAST_PATH (OVER, solid, a1, x8r8g8b8, fast_composite_over_n_1_8888), + PIXMAN_STD_FAST_PATH (OVER, solid, a1, a8b8g8r8, fast_composite_over_n_1_8888), + PIXMAN_STD_FAST_PATH (OVER, solid, a1, x8b8g8r8, fast_composite_over_n_1_8888), + PIXMAN_STD_FAST_PATH (OVER, solid, a1, r5g6b5, fast_composite_over_n_1_0565), + PIXMAN_STD_FAST_PATH (OVER, solid, a1, b5g6r5, fast_composite_over_n_1_0565), + PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, fast_composite_over_n_8888_8888_ca), + PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, fast_composite_over_n_8888_8888_ca), + PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, fast_composite_over_n_8888_0565_ca), + PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, fast_composite_over_n_8888_8888_ca), + PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, fast_composite_over_n_8888_8888_ca), + PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, fast_composite_over_n_8888_0565_ca), + PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, fast_composite_over_x888_8_8888), + PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, fast_composite_over_x888_8_8888), + PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, fast_composite_over_x888_8_8888), + PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, fast_composite_over_x888_8_8888), + PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, fast_composite_over_8888_8888), + PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, fast_composite_over_8888_8888), + PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, fast_composite_over_8888_0565), + PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, fast_composite_over_8888_8888), + PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, fast_composite_over_8888_8888), + PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, fast_composite_over_8888_0565), + PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, fast_composite_add_8888_8888), + PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, fast_composite_add_8888_8888), + PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, fast_composite_add_8_8), + PIXMAN_STD_FAST_PATH (ADD, a1, null, a1, fast_composite_add_1000_1000), + PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, fast_composite_add_n_8888_8888_ca), + PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, fast_composite_add_n_8_8), + PIXMAN_STD_FAST_PATH (SRC, solid, null, a8r8g8b8, fast_composite_solid_fill), + PIXMAN_STD_FAST_PATH (SRC, solid, null, x8r8g8b8, fast_composite_solid_fill), + PIXMAN_STD_FAST_PATH (SRC, solid, null, a8b8g8r8, fast_composite_solid_fill), + PIXMAN_STD_FAST_PATH (SRC, solid, null, x8b8g8r8, fast_composite_solid_fill), + PIXMAN_STD_FAST_PATH (SRC, solid, null, a1, fast_composite_solid_fill), + PIXMAN_STD_FAST_PATH (SRC, solid, null, a8, fast_composite_solid_fill), + PIXMAN_STD_FAST_PATH (SRC, solid, null, r5g6b5, fast_composite_solid_fill), + PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, fast_composite_src_x888_8888), + PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, fast_composite_src_x888_8888), + PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, fast_composite_src_memcpy), + PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, fast_composite_src_memcpy), + PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, fast_composite_src_memcpy), + PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, fast_composite_src_memcpy), + PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, fast_composite_src_memcpy), + 
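/* Illustrative sketch (editorial, not part of this patch): the addressing
 * used by fast_composite_src_memcpy above.  bits.rowstride is counted in
 * uint32_t units, so the byte stride is rowstride * 4, and the first byte of
 * pixel (x, y) in a bpp-bit format sits at y * stride_bytes + x * (bpp / 8).
 * pixel_address_sketch is a hypothetical helper. */
static uint8_t *
pixel_address_sketch (uint32_t *bits, int rowstride_words, int bpp, int x, int y)
{
    int stride_bytes = rowstride_words * (int) sizeof (uint32_t);

    return (uint8_t *) bits + y * stride_bytes + x * (bpp / 8);
}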
PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, fast_composite_src_memcpy), + PIXMAN_STD_FAST_PATH (SRC, b8g8r8a8, null, b8g8r8x8, fast_composite_src_memcpy), + PIXMAN_STD_FAST_PATH (SRC, b8g8r8a8, null, b8g8r8a8, fast_composite_src_memcpy), + PIXMAN_STD_FAST_PATH (SRC, b8g8r8x8, null, b8g8r8x8, fast_composite_src_memcpy), + PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, fast_composite_src_memcpy), + PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, fast_composite_src_memcpy), + PIXMAN_STD_FAST_PATH (SRC, r8g8b8, null, r8g8b8, fast_composite_src_memcpy), + PIXMAN_STD_FAST_PATH (SRC, b8g8r8, null, b8g8r8, fast_composite_src_memcpy), + PIXMAN_STD_FAST_PATH (SRC, x1r5g5b5, null, x1r5g5b5, fast_composite_src_memcpy), + PIXMAN_STD_FAST_PATH (SRC, a1r5g5b5, null, x1r5g5b5, fast_composite_src_memcpy), + PIXMAN_STD_FAST_PATH (SRC, a8, null, a8, fast_composite_src_memcpy), + PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, fast_composite_src_x888_0565), + PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, fast_composite_src_x888_0565), + PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, fast_composite_src_x888_0565), + PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, fast_composite_src_x888_0565), + PIXMAN_STD_FAST_PATH (IN, a8, null, a8, fast_composite_in_8_8), + PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, fast_composite_in_n_8_8), + + SIMPLE_NEAREST_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, 8888_8888), + SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, 8888_8888), + SIMPLE_NEAREST_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, 8888_8888), + SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, 8888_8888), + + SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, 8888_8888), + SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, 8888_8888), + + SIMPLE_NEAREST_FAST_PATH (SRC, x8r8g8b8, r5g6b5, 8888_565), + SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, r5g6b5, 8888_565), + + SIMPLE_NEAREST_FAST_PATH (SRC, r5g6b5, r5g6b5, 565_565), + + SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, 8888_8888), + SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, 8888_8888), + SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, 8888_8888), + SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, 8888_8888), + + SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, r5g6b5, 8888_565), + +#define NEAREST_FAST_PATH(op,s,d) \ + { PIXMAN_OP_ ## op, \ + PIXMAN_ ## s, SCALED_NEAREST_FLAGS, \ + PIXMAN_null, 0, \ + PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ + fast_composite_scaled_nearest, \ + } + + NEAREST_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8), + NEAREST_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8), + NEAREST_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8), + NEAREST_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8), + + NEAREST_FAST_PATH (SRC, x8r8g8b8, a8r8g8b8), + NEAREST_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8), + NEAREST_FAST_PATH (SRC, x8b8g8r8, a8b8g8r8), + NEAREST_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8), + + NEAREST_FAST_PATH (OVER, x8r8g8b8, x8r8g8b8), + NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8), + NEAREST_FAST_PATH (OVER, x8b8g8r8, x8b8g8r8), + NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8), + + NEAREST_FAST_PATH (OVER, x8r8g8b8, a8r8g8b8), + NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8), + NEAREST_FAST_PATH (OVER, x8b8g8r8, a8b8g8r8), + NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8), + +#define SIMPLE_ROTATE_FLAGS(angle) \ + (FAST_PATH_ROTATE_ ## angle ## _TRANSFORM | \ + FAST_PATH_NEAREST_FILTER | \ + FAST_PATH_SAMPLES_COVER_CLIP | \ + FAST_PATH_STANDARD_FLAGS) + +#define SIMPLE_ROTATE_FAST_PATH(op,s,d,suffix) \ + { PIXMAN_OP_ ## op, \ + PIXMAN_ ## s, SIMPLE_ROTATE_FLAGS (90), \ + 
PIXMAN_null, 0, \ + PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ + fast_composite_rotate_90_##suffix, \ + }, \ + { PIXMAN_OP_ ## op, \ + PIXMAN_ ## s, SIMPLE_ROTATE_FLAGS (270), \ + PIXMAN_null, 0, \ + PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ + fast_composite_rotate_270_##suffix, \ + } + + SIMPLE_ROTATE_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, 8888), + SIMPLE_ROTATE_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, 8888), + SIMPLE_ROTATE_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, 8888), + SIMPLE_ROTATE_FAST_PATH (SRC, r5g6b5, r5g6b5, 565), + SIMPLE_ROTATE_FAST_PATH (SRC, a8, a8, 8), + + { PIXMAN_OP_NONE }, +}; + +#ifdef WORDS_BIGENDIAN +#define A1_FILL_MASK(n, offs) (((1 << (n)) - 1) << (32 - (offs) - (n))) +#else +#define A1_FILL_MASK(n, offs) (((1 << (n)) - 1) << (offs)) +#endif + +static force_inline void +pixman_fill1_line (uint32_t *dst, int offs, int width, int v) +{ + if (offs) + { + int leading_pixels = 32 - offs; + if (leading_pixels >= width) + { + if (v) + *dst |= A1_FILL_MASK (width, offs); + else + *dst &= ~A1_FILL_MASK (width, offs); + return; + } + else + { + if (v) + *dst++ |= A1_FILL_MASK (leading_pixels, offs); + else + *dst++ &= ~A1_FILL_MASK (leading_pixels, offs); + width -= leading_pixels; + } + } + while (width >= 32) + { + if (v) + *dst++ = 0xFFFFFFFF; + else + *dst++ = 0; + width -= 32; + } + if (width > 0) + { + if (v) + *dst |= A1_FILL_MASK (width, 0); + else + *dst &= ~A1_FILL_MASK (width, 0); + } +} + +static void +pixman_fill1 (uint32_t *bits, + int stride, + int x, + int y, + int width, + int height, + uint32_t xor) +{ + uint32_t *dst = bits + y * stride + (x >> 5); + int offs = x & 31; + + if (xor & 1) + { + while (height--) + { + pixman_fill1_line (dst, offs, width, 1); + dst += stride; + } + } + else + { + while (height--) + { + pixman_fill1_line (dst, offs, width, 0); + dst += stride; + } + } +} + +static void +pixman_fill8 (uint32_t *bits, + int stride, + int x, + int y, + int width, + int height, + uint32_t xor) +{ + int byte_stride = stride * (int) sizeof (uint32_t); + uint8_t *dst = (uint8_t *) bits; + uint8_t v = xor & 0xff; + int i; + + dst = dst + y * byte_stride + x; + + while (height--) + { + for (i = 0; i < width; ++i) + dst[i] = v; + + dst += byte_stride; + } +} + +static void +pixman_fill16 (uint32_t *bits, + int stride, + int x, + int y, + int width, + int height, + uint32_t xor) +{ + int short_stride = + (stride * (int)sizeof (uint32_t)) / (int)sizeof (uint16_t); + uint16_t *dst = (uint16_t *)bits; + uint16_t v = xor & 0xffff; + int i; + + dst = dst + y * short_stride + x; + + while (height--) + { + for (i = 0; i < width; ++i) + dst[i] = v; + + dst += short_stride; + } +} + +static void +pixman_fill32 (uint32_t *bits, + int stride, + int x, + int y, + int width, + int height, + uint32_t xor) +{ + int i; + + bits = bits + y * stride + x; + + while (height--) + { + for (i = 0; i < width; ++i) + bits[i] = xor; + + bits += stride; + } +} + +static pixman_bool_t +fast_path_fill (pixman_implementation_t *imp, + uint32_t * bits, + int stride, + int bpp, + int x, + int y, + int width, + int height, + uint32_t xor) +{ + switch (bpp) + { + case 1: + pixman_fill1 (bits, stride, x, y, width, height, xor); + break; + + case 8: + pixman_fill8 (bits, stride, x, y, width, height, xor); + break; + + case 16: + pixman_fill16 (bits, stride, x, y, width, height, xor); + break; + + case 32: + pixman_fill32 (bits, stride, x, y, width, height, xor); + break; + + default: + return _pixman_implementation_fill ( + imp->delegate, bits, stride, bpp, x, y, width, height, xor); + break; + } + + 
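    /* Worked example (editorial, not part of this patch) for the 1 bpp case
     * handled above: filling 5 pixels starting at bit offset 3 of a word on a
     * little-endian host uses
     *     A1_FILL_MASK (5, 3) = ((1 << 5) - 1) << 3 = 0xf8,
     * which pixman_fill1_line ORs in when the fill value is set and ANDs out
     * otherwise; the big-endian variant builds the mirrored mask so that
     * pixel order within a word matches the framebuffer layout. */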
return TRUE; +} + +pixman_implementation_t * +_pixman_implementation_create_fast_path (pixman_implementation_t *fallback) +{ + pixman_implementation_t *imp = _pixman_implementation_create (fallback, c_fast_paths); + + imp->fill = fast_path_fill; + + return imp; +} diff --git a/pixman/pixman/pixman-fast-path.h b/pixman/pixman/pixman-fast-path.h index bb7032d86..d08122293 100644 --- a/pixman/pixman/pixman-fast-path.h +++ b/pixman/pixman/pixman-fast-path.h @@ -1,449 +1,590 @@ -/* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */ -/* - * Copyright © 2000 SuSE, Inc. - * Copyright © 2007 Red Hat, Inc. - * - * Permission to use, copy, modify, distribute, and sell this software and its - * documentation for any purpose is hereby granted without fee, provided that - * the above copyright notice appear in all copies and that both that - * copyright notice and this permission notice appear in supporting - * documentation, and that the name of SuSE not be used in advertising or - * publicity pertaining to distribution of the software without specific, - * written prior permission. SuSE makes no representations about the - * suitability of this software for any purpose. It is provided "as is" - * without express or implied warranty. - * - * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE - * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - * - * Author: Keith Packard, SuSE, Inc. - */ - -#ifndef PIXMAN_FAST_PATH_H__ -#define PIXMAN_FAST_PATH_H__ - -#include "pixman-private.h" - -#define PIXMAN_REPEAT_COVER -1 - -static force_inline pixman_bool_t -repeat (pixman_repeat_t repeat, int *c, int size) -{ - if (repeat == PIXMAN_REPEAT_NONE) - { - if (*c < 0 || *c >= size) - return FALSE; - } - else if (repeat == PIXMAN_REPEAT_NORMAL) - { - while (*c >= size) - *c -= size; - while (*c < 0) - *c += size; - } - else if (repeat == PIXMAN_REPEAT_PAD) - { - *c = CLIP (*c, 0, size - 1); - } - else /* REFLECT */ - { - *c = MOD (*c, size * 2); - if (*c >= size) - *c = size * 2 - *c - 1; - } - return TRUE; -} - -/* - * For each scanline fetched from source image with PAD repeat: - * - calculate how many pixels need to be padded on the left side - * - calculate how many pixels need to be padded on the right side - * - update width to only count pixels which are fetched from the image - * All this information is returned via 'width', 'left_pad', 'right_pad' - * arguments. The code is assuming that 'unit_x' is positive. - * - * Note: 64-bit math is used in order to avoid potential overflows, which - * is probably excessive in many cases. This particular function - * may need its own correctness test and performance tuning. 
- */ -static force_inline void -pad_repeat_get_scanline_bounds (int32_t source_image_width, - pixman_fixed_t vx, - pixman_fixed_t unit_x, - int32_t * width, - int32_t * left_pad, - int32_t * right_pad) -{ - int64_t max_vx = (int64_t) source_image_width << 16; - int64_t tmp; - if (vx < 0) - { - tmp = ((int64_t) unit_x - 1 - vx) / unit_x; - if (tmp > *width) - { - *left_pad = *width; - *width = 0; - } - else - { - *left_pad = (int32_t) tmp; - *width -= (int32_t) tmp; - } - } - else - { - *left_pad = 0; - } - tmp = ((int64_t) unit_x - 1 - vx + max_vx) / unit_x - *left_pad; - if (tmp < 0) - { - *right_pad = *width; - *width = 0; - } - else if (tmp >= *width) - { - *right_pad = 0; - } - else - { - *right_pad = *width - (int32_t) tmp; - *width = (int32_t) tmp; - } -} - -/* A macroified version of specialized nearest scalers for some - * common 8888 and 565 formats. It supports SRC and OVER ops. - * - * There are two repeat versions, one that handles repeat normal, - * and one without repeat handling that only works if the src region - * used is completely covered by the pre-repeated source samples. - * - * The loops are unrolled to process two pixels per iteration for better - * performance on most CPU architectures (superscalar processors - * can issue several operations simultaneously, other processors can hide - * instructions latencies by pipelining operations). Unrolling more - * does not make much sense because the compiler will start running out - * of spare registers soon. - */ - -#define GET_8888_ALPHA(s) ((s) >> 24) - /* This is not actually used since we don't have an OVER with - 565 source, but it is needed to build. */ -#define GET_0565_ALPHA(s) 0xff - -#define FAST_NEAREST_SCANLINE(scanline_func_name, SRC_FORMAT, DST_FORMAT, \ - src_type_t, dst_type_t, OP, repeat_mode) \ -static force_inline void \ -scanline_func_name (dst_type_t *dst, \ - src_type_t *src, \ - int32_t w, \ - pixman_fixed_t vx, \ - pixman_fixed_t unit_x, \ - pixman_fixed_t max_vx) \ -{ \ - uint32_t d; \ - src_type_t s1, s2; \ - uint8_t a1, a2; \ - int x1, x2; \ - \ - if (PIXMAN_OP_ ## OP != PIXMAN_OP_SRC && PIXMAN_OP_ ## OP != PIXMAN_OP_OVER) \ - abort(); \ - \ - while ((w -= 2) >= 0) \ - { \ - x1 = vx >> 16; \ - vx += unit_x; \ - if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL) \ - { \ - /* This works because we know that unit_x is positive */ \ - while (vx >= max_vx) \ - vx -= max_vx; \ - } \ - s1 = src[x1]; \ - \ - x2 = vx >> 16; \ - vx += unit_x; \ - if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL) \ - { \ - /* This works because we know that unit_x is positive */ \ - while (vx >= max_vx) \ - vx -= max_vx; \ - } \ - s2 = src[x2]; \ - \ - if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER) \ - { \ - a1 = GET_ ## SRC_FORMAT ## _ALPHA(s1); \ - a2 = GET_ ## SRC_FORMAT ## _ALPHA(s2); \ - \ - if (a1 == 0xff) \ - { \ - *dst = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1); \ - } \ - else if (s1) \ - { \ - d = CONVERT_ ## DST_FORMAT ## _TO_8888 (*dst); \ - s1 = CONVERT_ ## SRC_FORMAT ## _TO_8888 (s1); \ - a1 ^= 0xff; \ - UN8x4_MUL_UN8_ADD_UN8x4 (d, a1, s1); \ - *dst = CONVERT_8888_TO_ ## DST_FORMAT (d); \ - } \ - dst++; \ - \ - if (a2 == 0xff) \ - { \ - *dst = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s2); \ - } \ - else if (s2) \ - { \ - d = CONVERT_## DST_FORMAT ## _TO_8888 (*dst); \ - s2 = CONVERT_## SRC_FORMAT ## _TO_8888 (s2); \ - a2 ^= 0xff; \ - UN8x4_MUL_UN8_ADD_UN8x4 (d, a2, s2); \ - *dst = CONVERT_8888_TO_ ## DST_FORMAT (d); \ - } \ - dst++; \ - } \ - else /* PIXMAN_OP_SRC */ \ - { \ - *dst++ = 
CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1); \ - *dst++ = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s2); \ - } \ - } \ - \ - if (w & 1) \ - { \ - x1 = vx >> 16; \ - s1 = src[x1]; \ - \ - if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER) \ - { \ - a1 = GET_ ## SRC_FORMAT ## _ALPHA(s1); \ - \ - if (a1 == 0xff) \ - { \ - *dst = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1); \ - } \ - else if (s1) \ - { \ - d = CONVERT_## DST_FORMAT ## _TO_8888 (*dst); \ - s1 = CONVERT_ ## SRC_FORMAT ## _TO_8888 (s1); \ - a1 ^= 0xff; \ - UN8x4_MUL_UN8_ADD_UN8x4 (d, a1, s1); \ - *dst = CONVERT_8888_TO_ ## DST_FORMAT (d); \ - } \ - dst++; \ - } \ - else /* PIXMAN_OP_SRC */ \ - { \ - *dst++ = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1); \ - } \ - } \ -} - -#define FAST_NEAREST_MAINLOOP_INT(scale_func_name, scanline_func, src_type_t, dst_type_t, \ - repeat_mode) \ -static void \ -fast_composite_scaled_nearest ## scale_func_name (pixman_implementation_t *imp, \ - pixman_op_t op, \ - pixman_image_t * src_image, \ - pixman_image_t * mask_image, \ - pixman_image_t * dst_image, \ - int32_t src_x, \ - int32_t src_y, \ - int32_t mask_x, \ - int32_t mask_y, \ - int32_t dst_x, \ - int32_t dst_y, \ - int32_t width, \ - int32_t height) \ -{ \ - dst_type_t *dst_line; \ - src_type_t *src_first_line; \ - int y; \ - pixman_fixed_t max_vx = INT32_MAX; /* suppress uninitialized variable warning */ \ - pixman_fixed_t max_vy; \ - pixman_vector_t v; \ - pixman_fixed_t vx, vy; \ - pixman_fixed_t unit_x, unit_y; \ - int32_t left_pad, right_pad; \ - \ - src_type_t *src; \ - dst_type_t *dst; \ - int src_stride, dst_stride; \ - \ - PIXMAN_IMAGE_GET_LINE (dst_image, dst_x, dst_y, dst_type_t, dst_stride, dst_line, 1); \ - /* pass in 0 instead of src_x and src_y because src_x and src_y need to be \ - * transformed from destination space to source space */ \ - PIXMAN_IMAGE_GET_LINE (src_image, 0, 0, src_type_t, src_stride, src_first_line, 1); \ - \ - /* reference point is the center of the pixel */ \ - v.vector[0] = pixman_int_to_fixed (src_x) + pixman_fixed_1 / 2; \ - v.vector[1] = pixman_int_to_fixed (src_y) + pixman_fixed_1 / 2; \ - v.vector[2] = pixman_fixed_1; \ - \ - if (!pixman_transform_point_3d (src_image->common.transform, &v)) \ - return; \ - \ - unit_x = src_image->common.transform->matrix[0][0]; \ - unit_y = src_image->common.transform->matrix[1][1]; \ - \ - /* Round down to closest integer, ensuring that 0.5 rounds to 0, not 1 */ \ - v.vector[0] -= pixman_fixed_e; \ - v.vector[1] -= pixman_fixed_e; \ - \ - vx = v.vector[0]; \ - vy = v.vector[1]; \ - \ - if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL) \ - { \ - /* Clamp repeating positions inside the actual samples */ \ - max_vx = src_image->bits.width << 16; \ - max_vy = src_image->bits.height << 16; \ - \ - repeat (PIXMAN_REPEAT_NORMAL, &vx, max_vx); \ - repeat (PIXMAN_REPEAT_NORMAL, &vy, max_vy); \ - } \ - \ - if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD || \ - PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE) \ - { \ - pad_repeat_get_scanline_bounds (src_image->bits.width, vx, unit_x, \ - &width, &left_pad, &right_pad); \ - vx += left_pad * unit_x; \ - } \ - \ - while (--height >= 0) \ - { \ - dst = dst_line; \ - dst_line += dst_stride; \ - \ - y = vy >> 16; \ - vy += unit_y; \ - if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL) \ - repeat (PIXMAN_REPEAT_NORMAL, &vy, max_vy); \ - if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD) \ - { \ - repeat (PIXMAN_REPEAT_PAD, &y, src_image->bits.height); \ - src = src_first_line + 
src_stride * y; \ - if (left_pad > 0) \ - { \ - scanline_func (dst, src, left_pad, 0, 0, 0); \ - } \ - if (width > 0) \ - { \ - scanline_func (dst + left_pad, src, width, vx, unit_x, 0); \ - } \ - if (right_pad > 0) \ - { \ - scanline_func (dst + left_pad + width, src + src_image->bits.width - 1, \ - right_pad, 0, 0, 0); \ - } \ - } \ - else if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE) \ - { \ - static src_type_t zero[1] = { 0 }; \ - if (y < 0 || y >= src_image->bits.height) \ - { \ - scanline_func (dst, zero, left_pad + width + right_pad, 0, 0, 0); \ - continue; \ - } \ - src = src_first_line + src_stride * y; \ - if (left_pad > 0) \ - { \ - scanline_func (dst, zero, left_pad, 0, 0, 0); \ - } \ - if (width > 0) \ - { \ - scanline_func (dst + left_pad, src, width, vx, unit_x, 0); \ - } \ - if (right_pad > 0) \ - { \ - scanline_func (dst + left_pad + width, zero, right_pad, 0, 0, 0); \ - } \ - } \ - else \ - { \ - src = src_first_line + src_stride * y; \ - scanline_func (dst, src, width, vx, unit_x, max_vx); \ - } \ - } \ -} - -/* A workaround for old sun studio, see: https://bugs.freedesktop.org/show_bug.cgi?id=32764 */ -#define FAST_NEAREST_MAINLOOP(scale_func_name, scanline_func, src_type_t, dst_type_t, \ - repeat_mode) \ - FAST_NEAREST_MAINLOOP_INT(_ ## scale_func_name, scanline_func, src_type_t, dst_type_t, \ - repeat_mode) \ - -#define FAST_NEAREST(scale_func_name, SRC_FORMAT, DST_FORMAT, \ - src_type_t, dst_type_t, OP, repeat_mode) \ - FAST_NEAREST_SCANLINE(scaled_nearest_scanline_ ## scale_func_name ## _ ## OP, \ - SRC_FORMAT, DST_FORMAT, src_type_t, dst_type_t, \ - OP, repeat_mode) \ - FAST_NEAREST_MAINLOOP_INT(_ ## scale_func_name ## _ ## OP, \ - scaled_nearest_scanline_ ## scale_func_name ## _ ## OP, \ - src_type_t, dst_type_t, repeat_mode) - - -#define SCALED_NEAREST_FLAGS \ - (FAST_PATH_SCALE_TRANSFORM | \ - FAST_PATH_NO_ALPHA_MAP | \ - FAST_PATH_NEAREST_FILTER | \ - FAST_PATH_NO_ACCESSORS | \ - FAST_PATH_NARROW_FORMAT) - -#define SIMPLE_NEAREST_FAST_PATH_NORMAL(op,s,d,func) \ - { PIXMAN_OP_ ## op, \ - PIXMAN_ ## s, \ - (SCALED_NEAREST_FLAGS | \ - FAST_PATH_NORMAL_REPEAT | \ - FAST_PATH_X_UNIT_POSITIVE), \ - PIXMAN_null, 0, \ - PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ - fast_composite_scaled_nearest_ ## func ## _normal ## _ ## op, \ - } - -#define SIMPLE_NEAREST_FAST_PATH_PAD(op,s,d,func) \ - { PIXMAN_OP_ ## op, \ - PIXMAN_ ## s, \ - (SCALED_NEAREST_FLAGS | \ - FAST_PATH_PAD_REPEAT | \ - FAST_PATH_X_UNIT_POSITIVE), \ - PIXMAN_null, 0, \ - PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ - fast_composite_scaled_nearest_ ## func ## _pad ## _ ## op, \ - } - -#define SIMPLE_NEAREST_FAST_PATH_NONE(op,s,d,func) \ - { PIXMAN_OP_ ## op, \ - PIXMAN_ ## s, \ - (SCALED_NEAREST_FLAGS | \ - FAST_PATH_NONE_REPEAT | \ - FAST_PATH_X_UNIT_POSITIVE), \ - PIXMAN_null, 0, \ - PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ - fast_composite_scaled_nearest_ ## func ## _none ## _ ## op, \ - } - -#define SIMPLE_NEAREST_FAST_PATH_COVER(op,s,d,func) \ - { PIXMAN_OP_ ## op, \ - PIXMAN_ ## s, \ - SCALED_NEAREST_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP, \ - PIXMAN_null, 0, \ - PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ - fast_composite_scaled_nearest_ ## func ## _cover ## _ ## op, \ - } - -/* Prefer the use of 'cover' variant, because it is faster */ -#define SIMPLE_NEAREST_FAST_PATH(op,s,d,func) \ - SIMPLE_NEAREST_FAST_PATH_COVER (op,s,d,func), \ - SIMPLE_NEAREST_FAST_PATH_NONE (op,s,d,func), \ - SIMPLE_NEAREST_FAST_PATH_PAD (op,s,d,func), \ - SIMPLE_NEAREST_FAST_PATH_NORMAL (op,s,d,func) - -#endif +/* -*- 
Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */ +/* + * Copyright © 2000 SuSE, Inc. + * Copyright © 2007 Red Hat, Inc. + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that + * copyright notice and this permission notice appear in supporting + * documentation, and that the name of SuSE not be used in advertising or + * publicity pertaining to distribution of the software without specific, + * written prior permission. SuSE makes no representations about the + * suitability of this software for any purpose. It is provided "as is" + * without express or implied warranty. + * + * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE + * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + * Author: Keith Packard, SuSE, Inc. + */ + +#ifndef PIXMAN_FAST_PATH_H__ +#define PIXMAN_FAST_PATH_H__ + +#include "pixman-private.h" + +#define PIXMAN_REPEAT_COVER -1 + +static force_inline pixman_bool_t +repeat (pixman_repeat_t repeat, int *c, int size) +{ + if (repeat == PIXMAN_REPEAT_NONE) + { + if (*c < 0 || *c >= size) + return FALSE; + } + else if (repeat == PIXMAN_REPEAT_NORMAL) + { + while (*c >= size) + *c -= size; + while (*c < 0) + *c += size; + } + else if (repeat == PIXMAN_REPEAT_PAD) + { + *c = CLIP (*c, 0, size - 1); + } + else /* REFLECT */ + { + *c = MOD (*c, size * 2); + if (*c >= size) + *c = size * 2 - *c - 1; + } + return TRUE; +} + +/* + * For each scanline fetched from source image with PAD repeat: + * - calculate how many pixels need to be padded on the left side + * - calculate how many pixels need to be padded on the right side + * - update width to only count pixels which are fetched from the image + * All this information is returned via 'width', 'left_pad', 'right_pad' + * arguments. The code is assuming that 'unit_x' is positive. + * + * Note: 64-bit math is used in order to avoid potential overflows, which + * is probably excessive in many cases. This particular function + * may need its own correctness test and performance tuning. + */ +static force_inline void +pad_repeat_get_scanline_bounds (int32_t source_image_width, + pixman_fixed_t vx, + pixman_fixed_t unit_x, + int32_t * width, + int32_t * left_pad, + int32_t * right_pad) +{ + int64_t max_vx = (int64_t) source_image_width << 16; + int64_t tmp; + if (vx < 0) + { + tmp = ((int64_t) unit_x - 1 - vx) / unit_x; + if (tmp > *width) + { + *left_pad = *width; + *width = 0; + } + else + { + *left_pad = (int32_t) tmp; + *width -= (int32_t) tmp; + } + } + else + { + *left_pad = 0; + } + tmp = ((int64_t) unit_x - 1 - vx + max_vx) / unit_x - *left_pad; + if (tmp < 0) + { + *right_pad = *width; + *width = 0; + } + else if (tmp >= *width) + { + *right_pad = 0; + } + else + { + *right_pad = *width - (int32_t) tmp; + *width = (int32_t) tmp; + } +} + +/* A macroified version of specialized nearest scalers for some + * common 8888 and 565 formats. It supports SRC and OVER ops. 
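The comment above concedes that pad_repeat_get_scanline_bounds() "may need its own correctness test". A minimal standalone sketch of such a check, assuming the same 16.16 fixed-point convention and a positive unit_x, reimplements the bounds computation and compares it against a brute-force per-pixel classification; the helper name get_bounds and the test values are illustrative only and are not part of pixman or of this patch:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Same algorithm as pad_repeat_get_scanline_bounds() above,
 * 16.16 fixed-point source coordinates, unit_x assumed positive. */
static void
get_bounds (int32_t src_w, int32_t vx, int32_t unit_x,
            int32_t *width, int32_t *left_pad, int32_t *right_pad)
{
    int64_t max_vx = (int64_t) src_w << 16;
    int64_t tmp;

    if (vx < 0)
    {
        /* number of samples taken before vx becomes non-negative */
        tmp = ((int64_t) unit_x - 1 - vx) / unit_x;
        if (tmp > *width)
        {
            *left_pad = *width;
            *width = 0;
        }
        else
        {
            *left_pad = (int32_t) tmp;
            *width -= (int32_t) tmp;
        }
    }
    else
    {
        *left_pad = 0;
    }

    /* number of samples (past the left pad) that still fall inside the image */
    tmp = ((int64_t) unit_x - 1 - vx + max_vx) / unit_x - *left_pad;
    if (tmp < 0)
    {
        *right_pad = *width;
        *width = 0;
    }
    else if (tmp >= *width)
    {
        *right_pad = 0;
    }
    else
    {
        *right_pad = *width - (int32_t) tmp;
        *width = (int32_t) tmp;
    }
}

int
main (void)
{
    int32_t src_w  = 4;                 /* source scanline is 4 pixels wide   */
    int32_t unit_x = 65536;             /* identity scale: one pixel per step */
    int32_t vx     = -(2 << 16) - 32768;/* first sample at x = -2.5           */
    int32_t width  = 10, left_pad, right_pad;
    int32_t brute_left = 0, brute_mid = 0, brute_right = 0;
    int i, x;

    /* Brute force: classify each of the 10 destination samples individually,
     * using the same "vx >> 16" pixel selection as the scanline functions. */
    for (i = 0; i < 10; i++)
    {
        x = (vx + i * unit_x) >> 16;
        if (x < 0)
            brute_left++;
        else if (x >= src_w)
            brute_right++;
        else
            brute_mid++;
    }

    get_bounds (src_w, vx, unit_x, &width, &left_pad, &right_pad);

    printf ("left_pad=%d width=%d right_pad=%d\n",
            (int) left_pad, (int) width, (int) right_pad);

    assert (left_pad  == brute_left);   /* 3 padded pixels on the left  */
    assert (width     == brute_mid);    /* 4 pixels fetched from source */
    assert (right_pad == brute_right);  /* 3 padded pixels on the right */

    return 0;
}

Because unit_x is assumed positive, all left-pad samples precede the in-image samples, which in turn precede the right-pad samples, so the three counts fully describe the scanline split used by the main loop below.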
+ * + * There are two repeat versions, one that handles repeat normal, + * and one without repeat handling that only works if the src region + * used is completely covered by the pre-repeated source samples. + * + * The loops are unrolled to process two pixels per iteration for better + * performance on most CPU architectures (superscalar processors + * can issue several operations simultaneously, other processors can hide + * instructions latencies by pipelining operations). Unrolling more + * does not make much sense because the compiler will start running out + * of spare registers soon. + */ + +#define GET_8888_ALPHA(s) ((s) >> 24) + /* This is not actually used since we don't have an OVER with + 565 source, but it is needed to build. */ +#define GET_0565_ALPHA(s) 0xff + +#define FAST_NEAREST_SCANLINE(scanline_func_name, SRC_FORMAT, DST_FORMAT, \ + src_type_t, dst_type_t, OP, repeat_mode) \ +static force_inline void \ +scanline_func_name (dst_type_t *dst, \ + const src_type_t *src, \ + int32_t w, \ + pixman_fixed_t vx, \ + pixman_fixed_t unit_x, \ + pixman_fixed_t max_vx, \ + pixman_bool_t fully_transparent_src) \ +{ \ + uint32_t d; \ + src_type_t s1, s2; \ + uint8_t a1, a2; \ + int x1, x2; \ + \ + if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER && fully_transparent_src) \ + return; \ + \ + if (PIXMAN_OP_ ## OP != PIXMAN_OP_SRC && PIXMAN_OP_ ## OP != PIXMAN_OP_OVER) \ + abort(); \ + \ + while ((w -= 2) >= 0) \ + { \ + x1 = vx >> 16; \ + vx += unit_x; \ + if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL) \ + { \ + /* This works because we know that unit_x is positive */ \ + while (vx >= max_vx) \ + vx -= max_vx; \ + } \ + s1 = src[x1]; \ + \ + x2 = vx >> 16; \ + vx += unit_x; \ + if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL) \ + { \ + /* This works because we know that unit_x is positive */ \ + while (vx >= max_vx) \ + vx -= max_vx; \ + } \ + s2 = src[x2]; \ + \ + if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER) \ + { \ + a1 = GET_ ## SRC_FORMAT ## _ALPHA(s1); \ + a2 = GET_ ## SRC_FORMAT ## _ALPHA(s2); \ + \ + if (a1 == 0xff) \ + { \ + *dst = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1); \ + } \ + else if (s1) \ + { \ + d = CONVERT_ ## DST_FORMAT ## _TO_8888 (*dst); \ + s1 = CONVERT_ ## SRC_FORMAT ## _TO_8888 (s1); \ + a1 ^= 0xff; \ + UN8x4_MUL_UN8_ADD_UN8x4 (d, a1, s1); \ + *dst = CONVERT_8888_TO_ ## DST_FORMAT (d); \ + } \ + dst++; \ + \ + if (a2 == 0xff) \ + { \ + *dst = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s2); \ + } \ + else if (s2) \ + { \ + d = CONVERT_## DST_FORMAT ## _TO_8888 (*dst); \ + s2 = CONVERT_## SRC_FORMAT ## _TO_8888 (s2); \ + a2 ^= 0xff; \ + UN8x4_MUL_UN8_ADD_UN8x4 (d, a2, s2); \ + *dst = CONVERT_8888_TO_ ## DST_FORMAT (d); \ + } \ + dst++; \ + } \ + else /* PIXMAN_OP_SRC */ \ + { \ + *dst++ = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1); \ + *dst++ = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s2); \ + } \ + } \ + \ + if (w & 1) \ + { \ + x1 = vx >> 16; \ + s1 = src[x1]; \ + \ + if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER) \ + { \ + a1 = GET_ ## SRC_FORMAT ## _ALPHA(s1); \ + \ + if (a1 == 0xff) \ + { \ + *dst = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1); \ + } \ + else if (s1) \ + { \ + d = CONVERT_## DST_FORMAT ## _TO_8888 (*dst); \ + s1 = CONVERT_ ## SRC_FORMAT ## _TO_8888 (s1); \ + a1 ^= 0xff; \ + UN8x4_MUL_UN8_ADD_UN8x4 (d, a1, s1); \ + *dst = CONVERT_8888_TO_ ## DST_FORMAT (d); \ + } \ + dst++; \ + } \ + else /* PIXMAN_OP_SRC */ \ + { \ + *dst++ = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1); \ + } \ + } \ +} + +#define 
FAST_NEAREST_MAINLOOP_INT(scale_func_name, scanline_func, src_type_t, mask_type_t, \ + dst_type_t, repeat_mode, have_mask, mask_is_solid) \ +static void \ +fast_composite_scaled_nearest ## scale_func_name (pixman_implementation_t *imp, \ + pixman_op_t op, \ + pixman_image_t * src_image, \ + pixman_image_t * mask_image, \ + pixman_image_t * dst_image, \ + int32_t src_x, \ + int32_t src_y, \ + int32_t mask_x, \ + int32_t mask_y, \ + int32_t dst_x, \ + int32_t dst_y, \ + int32_t width, \ + int32_t height) \ +{ \ + dst_type_t *dst_line; \ + mask_type_t *mask_line; \ + src_type_t *src_first_line; \ + int y; \ + pixman_fixed_t max_vx = INT32_MAX; /* suppress uninitialized variable warning */ \ + pixman_fixed_t max_vy; \ + pixman_vector_t v; \ + pixman_fixed_t vx, vy; \ + pixman_fixed_t unit_x, unit_y; \ + int32_t left_pad, right_pad; \ + \ + src_type_t *src; \ + dst_type_t *dst; \ + mask_type_t solid_mask; \ + const mask_type_t *mask = &solid_mask; \ + int src_stride, mask_stride, dst_stride; \ + \ + PIXMAN_IMAGE_GET_LINE (dst_image, dst_x, dst_y, dst_type_t, dst_stride, dst_line, 1); \ + if (have_mask) \ + { \ + if (mask_is_solid) \ + solid_mask = _pixman_image_get_solid (imp, mask_image, dst_image->bits.format); \ + else \ + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, mask_type_t, \ + mask_stride, mask_line, 1); \ + } \ + /* pass in 0 instead of src_x and src_y because src_x and src_y need to be \ + * transformed from destination space to source space */ \ + PIXMAN_IMAGE_GET_LINE (src_image, 0, 0, src_type_t, src_stride, src_first_line, 1); \ + \ + /* reference point is the center of the pixel */ \ + v.vector[0] = pixman_int_to_fixed (src_x) + pixman_fixed_1 / 2; \ + v.vector[1] = pixman_int_to_fixed (src_y) + pixman_fixed_1 / 2; \ + v.vector[2] = pixman_fixed_1; \ + \ + if (!pixman_transform_point_3d (src_image->common.transform, &v)) \ + return; \ + \ + unit_x = src_image->common.transform->matrix[0][0]; \ + unit_y = src_image->common.transform->matrix[1][1]; \ + \ + /* Round down to closest integer, ensuring that 0.5 rounds to 0, not 1 */ \ + v.vector[0] -= pixman_fixed_e; \ + v.vector[1] -= pixman_fixed_e; \ + \ + vx = v.vector[0]; \ + vy = v.vector[1]; \ + \ + if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL) \ + { \ + /* Clamp repeating positions inside the actual samples */ \ + max_vx = src_image->bits.width << 16; \ + max_vy = src_image->bits.height << 16; \ + \ + repeat (PIXMAN_REPEAT_NORMAL, &vx, max_vx); \ + repeat (PIXMAN_REPEAT_NORMAL, &vy, max_vy); \ + } \ + \ + if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD || \ + PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE) \ + { \ + pad_repeat_get_scanline_bounds (src_image->bits.width, vx, unit_x, \ + &width, &left_pad, &right_pad); \ + vx += left_pad * unit_x; \ + } \ + \ + while (--height >= 0) \ + { \ + dst = dst_line; \ + dst_line += dst_stride; \ + if (have_mask && !mask_is_solid) \ + { \ + mask = mask_line; \ + mask_line += mask_stride; \ + } \ + \ + y = vy >> 16; \ + vy += unit_y; \ + if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL) \ + repeat (PIXMAN_REPEAT_NORMAL, &vy, max_vy); \ + if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD) \ + { \ + repeat (PIXMAN_REPEAT_PAD, &y, src_image->bits.height); \ + src = src_first_line + src_stride * y; \ + if (left_pad > 0) \ + { \ + scanline_func (mask, dst, src, left_pad, 0, 0, 0, FALSE); \ + } \ + if (width > 0) \ + { \ + scanline_func (mask + (mask_is_solid ? 
0 : left_pad), \ + dst + left_pad, src, width, vx, unit_x, 0, FALSE); \ + } \ + if (right_pad > 0) \ + { \ + scanline_func (mask + (mask_is_solid ? 0 : left_pad + width), \ + dst + left_pad + width, src + src_image->bits.width - 1, \ + right_pad, 0, 0, 0, FALSE); \ + } \ + } \ + else if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE) \ + { \ + static const src_type_t zero[1] = { 0 }; \ + if (y < 0 || y >= src_image->bits.height) \ + { \ + scanline_func (mask, dst, zero, left_pad + width + right_pad, 0, 0, 0, TRUE); \ + continue; \ + } \ + src = src_first_line + src_stride * y; \ + if (left_pad > 0) \ + { \ + scanline_func (mask, dst, zero, left_pad, 0, 0, 0, TRUE); \ + } \ + if (width > 0) \ + { \ + scanline_func (mask + (mask_is_solid ? 0 : left_pad), \ + dst + left_pad, src, width, vx, unit_x, 0, FALSE); \ + } \ + if (right_pad > 0) \ + { \ + scanline_func (mask + (mask_is_solid ? 0 : left_pad + width), \ + dst + left_pad + width, zero, right_pad, 0, 0, 0, TRUE); \ + } \ + } \ + else \ + { \ + src = src_first_line + src_stride * y; \ + scanline_func (mask, dst, src, width, vx, unit_x, max_vx, FALSE); \ + } \ + } \ +} + +/* A workaround for old sun studio, see: https://bugs.freedesktop.org/show_bug.cgi?id=32764 */ +#define FAST_NEAREST_MAINLOOP_COMMON(scale_func_name, scanline_func, src_type_t, mask_type_t, \ + dst_type_t, repeat_mode, have_mask, mask_is_solid) \ + FAST_NEAREST_MAINLOOP_INT(_ ## scale_func_name, scanline_func, src_type_t, mask_type_t, \ + dst_type_t, repeat_mode, have_mask, mask_is_solid) + +#define FAST_NEAREST_MAINLOOP_NOMASK(scale_func_name, scanline_func, src_type_t, dst_type_t, \ + repeat_mode) \ + static force_inline void \ + scanline_func##scale_func_name##_wrapper ( \ + const uint8_t *mask, \ + dst_type_t *dst, \ + const src_type_t *src, \ + int32_t w, \ + pixman_fixed_t vx, \ + pixman_fixed_t unit_x, \ + pixman_fixed_t max_vx, \ + pixman_bool_t fully_transparent_src) \ + { \ + scanline_func (dst, src, w, vx, unit_x, max_vx, fully_transparent_src); \ + } \ + FAST_NEAREST_MAINLOOP_INT (scale_func_name, scanline_func##scale_func_name##_wrapper, \ + src_type_t, uint8_t, dst_type_t, repeat_mode, FALSE, FALSE) + +#define FAST_NEAREST_MAINLOOP(scale_func_name, scanline_func, src_type_t, dst_type_t, \ + repeat_mode) \ + FAST_NEAREST_MAINLOOP_NOMASK(_ ## scale_func_name, scanline_func, src_type_t, \ + dst_type_t, repeat_mode) + +#define FAST_NEAREST(scale_func_name, SRC_FORMAT, DST_FORMAT, \ + src_type_t, dst_type_t, OP, repeat_mode) \ + FAST_NEAREST_SCANLINE(scaled_nearest_scanline_ ## scale_func_name ## _ ## OP, \ + SRC_FORMAT, DST_FORMAT, src_type_t, dst_type_t, \ + OP, repeat_mode) \ + FAST_NEAREST_MAINLOOP_NOMASK(_ ## scale_func_name ## _ ## OP, \ + scaled_nearest_scanline_ ## scale_func_name ## _ ## OP, \ + src_type_t, dst_type_t, repeat_mode) + + +#define SCALED_NEAREST_FLAGS \ + (FAST_PATH_SCALE_TRANSFORM | \ + FAST_PATH_NO_ALPHA_MAP | \ + FAST_PATH_NEAREST_FILTER | \ + FAST_PATH_NO_ACCESSORS | \ + FAST_PATH_NARROW_FORMAT) + +#define SIMPLE_NEAREST_FAST_PATH_NORMAL(op,s,d,func) \ + { PIXMAN_OP_ ## op, \ + PIXMAN_ ## s, \ + (SCALED_NEAREST_FLAGS | \ + FAST_PATH_NORMAL_REPEAT | \ + FAST_PATH_X_UNIT_POSITIVE), \ + PIXMAN_null, 0, \ + PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ + fast_composite_scaled_nearest_ ## func ## _normal ## _ ## op, \ + } + +#define SIMPLE_NEAREST_FAST_PATH_PAD(op,s,d,func) \ + { PIXMAN_OP_ ## op, \ + PIXMAN_ ## s, \ + (SCALED_NEAREST_FLAGS | \ + FAST_PATH_PAD_REPEAT | \ + FAST_PATH_X_UNIT_POSITIVE), \ + PIXMAN_null, 0, \ + PIXMAN_ ## d, 
FAST_PATH_STD_DEST_FLAGS, \ + fast_composite_scaled_nearest_ ## func ## _pad ## _ ## op, \ + } + +#define SIMPLE_NEAREST_FAST_PATH_NONE(op,s,d,func) \ + { PIXMAN_OP_ ## op, \ + PIXMAN_ ## s, \ + (SCALED_NEAREST_FLAGS | \ + FAST_PATH_NONE_REPEAT | \ + FAST_PATH_X_UNIT_POSITIVE), \ + PIXMAN_null, 0, \ + PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ + fast_composite_scaled_nearest_ ## func ## _none ## _ ## op, \ + } + +#define SIMPLE_NEAREST_FAST_PATH_COVER(op,s,d,func) \ + { PIXMAN_OP_ ## op, \ + PIXMAN_ ## s, \ + SCALED_NEAREST_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP, \ + PIXMAN_null, 0, \ + PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ + fast_composite_scaled_nearest_ ## func ## _cover ## _ ## op, \ + } + +#define SIMPLE_NEAREST_A8_MASK_FAST_PATH_NORMAL(op,s,d,func) \ + { PIXMAN_OP_ ## op, \ + PIXMAN_ ## s, \ + (SCALED_NEAREST_FLAGS | \ + FAST_PATH_NORMAL_REPEAT | \ + FAST_PATH_X_UNIT_POSITIVE), \ + PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA), \ + PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ + fast_composite_scaled_nearest_ ## func ## _normal ## _ ## op, \ + } + +#define SIMPLE_NEAREST_A8_MASK_FAST_PATH_PAD(op,s,d,func) \ + { PIXMAN_OP_ ## op, \ + PIXMAN_ ## s, \ + (SCALED_NEAREST_FLAGS | \ + FAST_PATH_PAD_REPEAT | \ + FAST_PATH_X_UNIT_POSITIVE), \ + PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA), \ + PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ + fast_composite_scaled_nearest_ ## func ## _pad ## _ ## op, \ + } + +#define SIMPLE_NEAREST_A8_MASK_FAST_PATH_NONE(op,s,d,func) \ + { PIXMAN_OP_ ## op, \ + PIXMAN_ ## s, \ + (SCALED_NEAREST_FLAGS | \ + FAST_PATH_NONE_REPEAT | \ + FAST_PATH_X_UNIT_POSITIVE), \ + PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA), \ + PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ + fast_composite_scaled_nearest_ ## func ## _none ## _ ## op, \ + } + +#define SIMPLE_NEAREST_A8_MASK_FAST_PATH_COVER(op,s,d,func) \ + { PIXMAN_OP_ ## op, \ + PIXMAN_ ## s, \ + SCALED_NEAREST_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP, \ + PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA), \ + PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ + fast_composite_scaled_nearest_ ## func ## _cover ## _ ## op, \ + } + +#define SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL(op,s,d,func) \ + { PIXMAN_OP_ ## op, \ + PIXMAN_ ## s, \ + (SCALED_NEAREST_FLAGS | \ + FAST_PATH_NORMAL_REPEAT | \ + FAST_PATH_X_UNIT_POSITIVE), \ + PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA), \ + PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ + fast_composite_scaled_nearest_ ## func ## _normal ## _ ## op, \ + } + +#define SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_PAD(op,s,d,func) \ + { PIXMAN_OP_ ## op, \ + PIXMAN_ ## s, \ + (SCALED_NEAREST_FLAGS | \ + FAST_PATH_PAD_REPEAT | \ + FAST_PATH_X_UNIT_POSITIVE), \ + PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA), \ + PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ + fast_composite_scaled_nearest_ ## func ## _pad ## _ ## op, \ + } + +#define SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NONE(op,s,d,func) \ + { PIXMAN_OP_ ## op, \ + PIXMAN_ ## s, \ + (SCALED_NEAREST_FLAGS | \ + FAST_PATH_NONE_REPEAT | \ + FAST_PATH_X_UNIT_POSITIVE), \ + PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA), \ + PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ + fast_composite_scaled_nearest_ ## func ## _none ## _ ## op, \ + } + +#define SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_COVER(op,s,d,func) \ + { PIXMAN_OP_ ## op, \ + PIXMAN_ ## s, \ + SCALED_NEAREST_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP, \ + PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA), \ + PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \ + fast_composite_scaled_nearest_ 
## func ## _cover ## _ ## op, \ + } + +/* Prefer the use of 'cover' variant, because it is faster */ +#define SIMPLE_NEAREST_FAST_PATH(op,s,d,func) \ + SIMPLE_NEAREST_FAST_PATH_COVER (op,s,d,func), \ + SIMPLE_NEAREST_FAST_PATH_NONE (op,s,d,func), \ + SIMPLE_NEAREST_FAST_PATH_PAD (op,s,d,func), \ + SIMPLE_NEAREST_FAST_PATH_NORMAL (op,s,d,func) + +#define SIMPLE_NEAREST_A8_MASK_FAST_PATH(op,s,d,func) \ + SIMPLE_NEAREST_A8_MASK_FAST_PATH_COVER (op,s,d,func), \ + SIMPLE_NEAREST_A8_MASK_FAST_PATH_NONE (op,s,d,func), \ + SIMPLE_NEAREST_A8_MASK_FAST_PATH_PAD (op,s,d,func) + +#define SIMPLE_NEAREST_SOLID_MASK_FAST_PATH(op,s,d,func) \ + SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_COVER (op,s,d,func), \ + SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NONE (op,s,d,func), \ + SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_PAD (op,s,d,func) + +#endif diff --git a/pixman/pixman/pixman-sse2.c b/pixman/pixman/pixman-sse2.c index 91adc0560..2e135e2fe 100644 --- a/pixman/pixman/pixman-sse2.c +++ b/pixman/pixman/pixman-sse2.c @@ -5800,7 +5800,8 @@ scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t* pd, int32_t w, pixman_fixed_t vx, pixman_fixed_t unit_x, - pixman_fixed_t max_vx) + pixman_fixed_t max_vx, + pixman_bool_t fully_transparent_src) { uint32_t s, d; const uint32_t* pm = NULL; @@ -5809,6 +5810,9 @@ scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t* pd, __m128i xmm_src_lo, xmm_src_hi; __m128i xmm_alpha_lo, xmm_alpha_hi; + if (fully_transparent_src) + return; + /* Align dst on a 16-byte boundary */ while (w && ((unsigned long)pd & 15)) { @@ -5894,6 +5898,119 @@ FAST_NEAREST_MAINLOOP (sse2_8888_8888_pad_OVER, scaled_nearest_scanline_sse2_8888_8888_OVER, uint32_t, uint32_t, PAD) +static force_inline void +scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask, + uint32_t * dst, + const uint32_t * src, + int32_t w, + pixman_fixed_t vx, + pixman_fixed_t unit_x, + pixman_fixed_t max_vx, + pixman_bool_t zero_src) +{ + __m128i xmm_mask; + __m128i xmm_src, xmm_src_lo, xmm_src_hi; + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; + __m128i xmm_alpha_lo, xmm_alpha_hi; + + if (zero_src || (*mask >> 24) == 0) + return; + + xmm_mask = create_mask_16_128 (*mask >> 24); + + while (w && (unsigned long)dst & 15) + { + uint32_t s = src[pixman_fixed_to_int (vx)]; + vx += unit_x; + + if (s) + { + uint32_t d = *dst; + + __m64 ms = unpack_32_1x64 (s); + __m64 alpha = expand_alpha_1x64 (ms); + __m64 dest = _mm_movepi64_pi64 (xmm_mask); + __m64 alpha_dst = unpack_32_1x64 (d); + + *dst = pack_1x64_32 ( + in_over_1x64 (&ms, &alpha, &dest, &alpha_dst)); + } + dst++; + w--; + } + + while (w >= 4) + { + uint32_t tmp1, tmp2, tmp3, tmp4; + + tmp1 = src[pixman_fixed_to_int (vx)]; + vx += unit_x; + tmp2 = src[pixman_fixed_to_int (vx)]; + vx += unit_x; + tmp3 = src[pixman_fixed_to_int (vx)]; + vx += unit_x; + tmp4 = src[pixman_fixed_to_int (vx)]; + vx += unit_x; + + xmm_src = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1); + + if (!is_zero (xmm_src)) + { + xmm_dst = load_128_aligned ((__m128i*)dst); + + unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_alpha_lo, &xmm_alpha_hi); + + in_over_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_alpha_lo, &xmm_alpha_hi, + &xmm_mask, &xmm_mask, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + } + + dst += 4; + w -= 4; + } + + while (w) + { + uint32_t s = src[pixman_fixed_to_int (vx)]; + vx += unit_x; + + if (s) + { + uint32_t d = *dst; + + __m64 ms = 
unpack_32_1x64 (s); + __m64 alpha = expand_alpha_1x64 (ms); + __m64 mask = _mm_movepi64_pi64 (xmm_mask); + __m64 dest = unpack_32_1x64 (d); + + *dst = pack_1x64_32 ( + in_over_1x64 (&ms, &alpha, &mask, &dest)); + } + + dst++; + w--; + } + + _mm_empty (); +} + +FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER, + scaled_nearest_scanline_sse2_8888_n_8888_OVER, + uint32_t, uint32_t, uint32_t, COVER, TRUE, TRUE) +FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER, + scaled_nearest_scanline_sse2_8888_n_8888_OVER, + uint32_t, uint32_t, uint32_t, PAD, TRUE, TRUE) +FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER, + scaled_nearest_scanline_sse2_8888_n_8888_OVER, + uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE) + static const pixman_fast_path_t sse2_fast_paths[] = { /* PIXMAN_OP_OVER */ @@ -5990,6 +6107,11 @@ static const pixman_fast_path_t sse2_fast_paths[] = SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888), SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888), + SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888), + SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888), + SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888), + SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888), + { PIXMAN_OP_NONE }, }; diff --git a/pixman/test/Makefile.am b/pixman/test/Makefile.am index 8d8471d1c..3ce466eec 100644 --- a/pixman/test/Makefile.am +++ b/pixman/test/Makefile.am @@ -1,7 +1,6 @@ AM_CFLAGS = @OPENMP_CFLAGS@ -AM_LDFLAGS = @OPENMP_CFLAGS@ - -TEST_LDADD = $(top_builddir)/pixman/libpixman-1.la -lm +AM_LDFLAGS = @OPENMP_CFLAGS@ @TESTPROGS_EXTRA_LDFLAGS@ +LDADD = $(top_builddir)/pixman/libpixman-1.la -lm INCLUDES = -I$(top_srcdir)/pixman -I$(top_builddir)/pixman TESTPROGRAMS = \ @@ -22,121 +21,24 @@ TESTPROGRAMS = \ affine-test \ composite -a1_trap_test_LDADD = $(TEST_LDADD) -a1_trap_test_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@ - -fetch_test_LDADD = $(TEST_LDADD) -fetch_test_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@ - -trap_crasher_LDADD = $(TEST_LDADD) -trap_crasher_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@ - -oob_test_LDADD = $(TEST_LDADD) -oob_test_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@ - -scaling_crash_test_LDADD = $(TEST_LDADD) -scaling_crash_test_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@ - -region_translate_test_LDADD = $(TEST_LDADD) -region_translate_test_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@ - -pdf_op_test_LDADD = $(TEST_LDADD) -pdf_op_test_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@ pdf_op_test_SOURCES = pdf-op-test.c utils.c utils.h - -region_test_LDADD = $(TEST_LDADD) -region_test_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@ region_test_SOURCES = region-test.c utils.c utils.h - -blitters_test_LDADD = $(TEST_LDADD) -blitters_test_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@ blitters_test_SOURCES = blitters-test.c utils.c utils.h - -scaling_test_LDADD = $(TEST_LDADD) -scaling_test_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@ scaling_test_SOURCES = scaling-test.c utils.c utils.h - -affine_test_LDADD = $(TEST_LDADD) -affine_test_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@ affine_test_SOURCES = affine-test.c utils.c utils.h - -alphamap_LDADD = $(TEST_LDADD) -alphamap_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@ alphamap_SOURCES = alphamap.c utils.c utils.h - -alpha_loop_LDADD = $(TEST_LDADD) -alpha_loop_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@ alpha_loop_SOURCES = alpha-loop.c utils.c utils.h - -composite_LDADD = $(TEST_LDADD) -composite_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@ composite_SOURCES = composite.c utils.c 
utils.h - -gradient_crash_test_LDADD = $(TEST_LDADD) -gradient_crash_test_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@ gradient_crash_test_SOURCES = gradient-crash-test.c utils.c utils.h - -stress_test_LDADD = $(TEST_LDADD) -stress_test_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@ stress_test_SOURCES = stress-test.c utils.c utils.h -# GTK using test programs - -if HAVE_GTK - -GTK_LDADD = $(TEST_LDADD) $(GTK_LIBS) -GTK_UTILS = gtk-utils.c gtk-utils.h - -TESTPROGRAMS_GTK = \ - clip-test \ - clip-in \ - composite-test \ - gradient-test \ - radial-test \ - alpha-test \ - screen-test \ - convolution-test \ - trap-test - -INCLUDES += $(GTK_CFLAGS) - -gradient_test_LDADD = $(GTK_LDADD) -gradient_test_SOURCES = gradient-test.c $(GTK_UTILS) - -radial_test_LDADD = $(GTK_LDADD) -radial_test_SOURCES = radial-test.c utils.c utils.h $(GTK_UTILS) - -alpha_test_LDADD = $(GTK_LDADD) -alpha_test_SOURCES = alpha-test.c $(GTK_UTILS) - -composite_test_LDADD = $(GTK_LDADD) -composite_test_SOURCES = composite-test.c $(GTK_UTILS) - -clip_test_LDADD = $(GTK_LDADD) -clip_test_SOURCES = clip-test.c $(GTK_UTILS) - -clip_in_LDADD = $(GTK_LDADD) -clip_in_SOURCES = clip-in.c $(GTK_UTILS) - -trap_test_LDADD = $(GTK_LDADD) -trap_test_SOURCES = trap-test.c $(GTK_UTILS) - -screen_test_LDADD = $(GTK_LDADD) -screen_test_SOURCES = screen-test.c $(GTK_UTILS) - -convolution_test_LDADD = $(GTK_LDADD) -convolution_test_SOURCES = convolution-test.c $(GTK_UTILS) - -endif - # Benchmarks BENCHMARKS = \ lowlevel-blt-bench lowlevel_blt_bench_SOURCES = lowlevel-blt-bench.c utils.c utils.h -lowlevel_blt_bench_LDADD = $(TEST_LDADD) -noinst_PROGRAMS = $(TESTPROGRAMS) $(TESTPROGRAMS_GTK) $(BENCHMARKS) +noinst_PROGRAMS = $(TESTPROGRAMS) $(BENCHMARKS) TESTS = $(TESTPROGRAMS) diff --git a/pixman/test/alpha-test.c b/pixman/test/alpha-test.c deleted file mode 100644 index 92c208142..000000000 --- a/pixman/test/alpha-test.c +++ /dev/null @@ -1,117 +0,0 @@ -#include -#include -#include "pixman.h" -#include "gtk-utils.h" - -int -main (int argc, char **argv) -{ -#define WIDTH 400 -#define HEIGHT 200 - - uint32_t *alpha = malloc (WIDTH * HEIGHT * 4); - uint32_t *dest = malloc (WIDTH * HEIGHT * 4); - uint32_t *src = malloc (WIDTH * HEIGHT * 4); - pixman_image_t *grad_img; - pixman_image_t *alpha_img; - pixman_image_t *dest_img; - pixman_image_t *src_img; - int i; - pixman_gradient_stop_t stops[2] = - { - { pixman_int_to_fixed (0), { 0x0000, 0x0000, 0x0000, 0x0000 } }, - { pixman_int_to_fixed (1), { 0xffff, 0x0000, 0x1111, 0xffff } } - }; - pixman_point_fixed_t p1 = { pixman_double_to_fixed (0), 0 }; - pixman_point_fixed_t p2 = { pixman_double_to_fixed (WIDTH), - pixman_int_to_fixed (0) }; -#if 0 - pixman_transform_t trans = { - { { pixman_double_to_fixed (2), pixman_double_to_fixed (0.5), pixman_double_to_fixed (-100), }, - { pixman_double_to_fixed (0), pixman_double_to_fixed (3), pixman_double_to_fixed (0), }, - { pixman_double_to_fixed (0), pixman_double_to_fixed (0.000), pixman_double_to_fixed (1.0) } - } - }; -#else - pixman_transform_t trans = { - { { pixman_fixed_1, 0, 0 }, - { 0, pixman_fixed_1, 0 }, - { 0, 0, pixman_fixed_1 } } - }; -#endif - - pixman_point_fixed_t c_inner; - pixman_point_fixed_t c_outer; - pixman_fixed_t r_inner; - pixman_fixed_t r_outer; - - for (i = 0; i < WIDTH * HEIGHT; ++i) - alpha[i] = 0x4f00004f; /* pale blue */ - - alpha_img = pixman_image_create_bits (PIXMAN_a8r8g8b8, - WIDTH, HEIGHT, - alpha, - WIDTH * 4); - - for (i = 0; i < WIDTH * HEIGHT; ++i) - dest[i] = 0xffffff00; /* yellow */ - - dest_img = pixman_image_create_bits 
(PIXMAN_a8r8g8b8, - WIDTH, HEIGHT, - dest, - WIDTH * 4); - - for (i = 0; i < WIDTH * HEIGHT; ++i) - src[i] = 0xffff0000; - - src_img = pixman_image_create_bits (PIXMAN_a8r8g8b8, - WIDTH, HEIGHT, - src, - WIDTH * 4); - - c_inner.x = pixman_double_to_fixed (50.0); - c_inner.y = pixman_double_to_fixed (50.0); - c_outer.x = pixman_double_to_fixed (50.0); - c_outer.y = pixman_double_to_fixed (50.0); - r_inner = 0; - r_outer = pixman_double_to_fixed (50.0); - -#if 0 - grad_img = pixman_image_create_conical_gradient (&c_inner, r_inner, - stops, 2); -#endif -#if 0 - grad_img = pixman_image_create_conical_gradient (&c_inner, r_inner, - stops, 2); - grad_img = pixman_image_create_linear_gradient (&c_inner, &c_outer, - r_inner, r_outer, - stops, 2); -#endif - - grad_img = pixman_image_create_linear_gradient (&p1, &p2, - stops, 2); - - pixman_image_set_transform (grad_img, &trans); - pixman_image_set_repeat (grad_img, PIXMAN_REPEAT_PAD); - - pixman_image_composite (PIXMAN_OP_OVER, grad_img, NULL, alpha_img, - 0, 0, 0, 0, 0, 0, 10 * WIDTH, HEIGHT); - - pixman_image_set_alpha_map (src_img, alpha_img, 10, 10); - - pixman_image_composite (PIXMAN_OP_OVER, src_img, NULL, dest_img, - 0, 0, 0, 0, 0, 0, 10 * WIDTH, HEIGHT); - - printf ("0, 0: %x\n", dest[0]); - printf ("10, 10: %x\n", dest[10 * 10 + 10]); - printf ("w, h: %x\n", dest[(HEIGHT - 1) * 100 + (WIDTH - 1)]); - - show_image (dest_img); - - pixman_image_unref (src_img); - pixman_image_unref (grad_img); - pixman_image_unref (alpha_img); - free (dest); - - return 0; -} diff --git a/pixman/test/clip-in.c b/pixman/test/clip-in.c deleted file mode 100644 index 51579811f..000000000 --- a/pixman/test/clip-in.c +++ /dev/null @@ -1,50 +0,0 @@ -#include -#include -#include -#include "pixman.h" -#include "gtk-utils.h" - -/* This test demonstrates that clipping is done totally different depending - * on whether the source is transformed or not. 
- */ -int -main (int argc, char **argv) -{ -#define WIDTH 200 -#define HEIGHT 200 - -#define SMALL 25 - - uint32_t *sbits = malloc (SMALL * SMALL * 4); - uint32_t *bits = malloc (WIDTH * HEIGHT * 4); - pixman_transform_t trans = { - { - { pixman_double_to_fixed (1.0), pixman_double_to_fixed (0), pixman_double_to_fixed (-0.1), }, - { pixman_double_to_fixed (0), pixman_double_to_fixed (1), pixman_double_to_fixed (-0.1), }, - { pixman_double_to_fixed (0), pixman_double_to_fixed (0), pixman_double_to_fixed (1.0) } - } }; - - pixman_image_t *src_img = pixman_image_create_bits (PIXMAN_a8r8g8b8, SMALL, SMALL, sbits, 4 * SMALL); - pixman_image_t *dest_img = pixman_image_create_bits (PIXMAN_a8r8g8b8, WIDTH, HEIGHT, bits, 4 * WIDTH); - - memset (bits, 0xff, WIDTH * HEIGHT * 4); - memset (sbits, 0x00, SMALL * SMALL * 4); - - pixman_image_composite (PIXMAN_OP_IN, - src_img, NULL, dest_img, - 0, 0, 0, 0, SMALL, SMALL, 200, 200); - - pixman_image_set_transform (src_img, &trans); - - pixman_image_composite (PIXMAN_OP_IN, - src_img, NULL, dest_img, - 0, 0, 0, 0, SMALL * 2, SMALL * 2, 200, 200); - - show_image (dest_img); - - pixman_image_unref (src_img); - pixman_image_unref (dest_img); - free (bits); - - return 0; -} diff --git a/pixman/test/clip-test.c b/pixman/test/clip-test.c deleted file mode 100644 index aa0df4482..000000000 --- a/pixman/test/clip-test.c +++ /dev/null @@ -1,97 +0,0 @@ -#include -#include -#include "pixman.h" -#include "gtk-utils.h" - -#define WIDTH 200 -#define HEIGHT 200 - -static pixman_image_t * -create_solid_bits (uint32_t pixel) -{ - uint32_t *pixels = malloc (WIDTH * HEIGHT * 4); - int i; - - for (i = 0; i < WIDTH * HEIGHT; ++i) - pixels[i] = pixel; - - return pixman_image_create_bits (PIXMAN_a8r8g8b8, - WIDTH, HEIGHT, - pixels, - WIDTH * 4); -} - -int -main (int argc, char **argv) -{ - pixman_image_t *gradient_img; - pixman_image_t *src_img, *dst_img; - pixman_gradient_stop_t stops[2] = - { - { pixman_int_to_fixed (0), { 0xffff, 0x0000, 0x0000, 0xffff } }, - { pixman_int_to_fixed (1), { 0xffff, 0xffff, 0x0000, 0xffff } } - }; -#if 0 - pixman_point_fixed_t p1 = { 0, 0 }; - pixman_point_fixed_t p2 = { pixman_int_to_fixed (WIDTH), - pixman_int_to_fixed (HEIGHT) }; -#endif - pixman_point_fixed_t c_inner; - pixman_point_fixed_t c_outer; - pixman_fixed_t r_inner; - pixman_fixed_t r_outer; - pixman_region32_t clip_region; - pixman_transform_t trans = { - { { pixman_double_to_fixed (1.3), pixman_double_to_fixed (0), pixman_double_to_fixed (-0.5), }, - { pixman_double_to_fixed (0), pixman_double_to_fixed (1), pixman_double_to_fixed (-0.5), }, - { pixman_double_to_fixed (0), pixman_double_to_fixed (0), pixman_double_to_fixed (1.0) } - } - }; - - src_img = create_solid_bits (0xff0000ff); - - c_inner.x = pixman_double_to_fixed (100.0); - c_inner.y = pixman_double_to_fixed (100.0); - c_outer.x = pixman_double_to_fixed (100.0); - c_outer.y = pixman_double_to_fixed (100.0); - r_inner = 0; - r_outer = pixman_double_to_fixed (100.0); - - gradient_img = pixman_image_create_radial_gradient (&c_inner, &c_outer, - r_inner, r_outer, - stops, 2); - -#if 0 - gradient_img = pixman_image_create_linear_gradient (&p1, &p2, - stops, 2); - -#endif - - pixman_image_composite (PIXMAN_OP_OVER, gradient_img, NULL, src_img, - 0, 0, 0, 0, 0, 0, WIDTH, HEIGHT); - - pixman_region32_init_rect (&clip_region, 50, 0, 100, 200); - pixman_image_set_clip_region32 (src_img, &clip_region); - pixman_image_set_source_clipping (src_img, TRUE); - pixman_image_set_has_client_clip (src_img, TRUE); - pixman_image_set_transform 
(src_img, &trans); - pixman_image_set_repeat (src_img, PIXMAN_REPEAT_NORMAL); - - dst_img = create_solid_bits (0xffff0000); - pixman_image_composite (PIXMAN_OP_OVER, src_img, NULL, dst_img, - 0, 0, 0, 0, 0, 0, WIDTH, HEIGHT); - - -#if 0 - printf ("0, 0: %x\n", src[0]); - printf ("10, 10: %x\n", src[10 * 10 + 10]); - printf ("w, h: %x\n", src[(HEIGHT - 1) * 100 + (WIDTH - 1)]); -#endif - - show_image (dst_img); - - pixman_image_unref (gradient_img); - pixman_image_unref (src_img); - - return 0; -} diff --git a/pixman/test/composite-test.c b/pixman/test/composite-test.c deleted file mode 100644 index 79d5d5eac..000000000 --- a/pixman/test/composite-test.c +++ /dev/null @@ -1,191 +0,0 @@ -#include -#include -#include -#include "pixman.h" -#include "gtk-utils.h" - -#define WIDTH 60 -#define HEIGHT 60 - -typedef struct { - const char *name; - pixman_op_t op; -} operator_t; - -static const operator_t operators[] = { - { "CLEAR", PIXMAN_OP_CLEAR }, - { "SRC", PIXMAN_OP_SRC }, - { "DST", PIXMAN_OP_DST }, - { "OVER", PIXMAN_OP_OVER }, - { "OVER_REVERSE", PIXMAN_OP_OVER_REVERSE }, - { "IN", PIXMAN_OP_IN }, - { "IN_REVERSE", PIXMAN_OP_IN_REVERSE }, - { "OUT", PIXMAN_OP_OUT }, - { "OUT_REVERSE", PIXMAN_OP_OUT_REVERSE }, - { "ATOP", PIXMAN_OP_ATOP }, - { "ATOP_REVERSE", PIXMAN_OP_ATOP_REVERSE }, - { "XOR", PIXMAN_OP_XOR }, - { "ADD", PIXMAN_OP_ADD }, - { "SATURATE", PIXMAN_OP_SATURATE }, - - { "MULTIPLY", PIXMAN_OP_MULTIPLY }, - { "SCREEN", PIXMAN_OP_SCREEN }, - { "OVERLAY", PIXMAN_OP_OVERLAY }, - { "DARKEN", PIXMAN_OP_DARKEN }, - { "LIGHTEN", PIXMAN_OP_LIGHTEN }, - { "COLOR_DODGE", PIXMAN_OP_COLOR_DODGE }, - { "COLOR_BURN", PIXMAN_OP_COLOR_BURN }, - { "HARD_LIGHT", PIXMAN_OP_HARD_LIGHT }, - { "SOFT_LIGHT", PIXMAN_OP_SOFT_LIGHT }, - { "DIFFERENCE", PIXMAN_OP_DIFFERENCE }, - { "EXCLUSION", PIXMAN_OP_EXCLUSION }, - { "HSL_HUE", PIXMAN_OP_HSL_HUE }, - { "HSL_SATURATION", PIXMAN_OP_HSL_SATURATION }, - { "HSL_COLOR", PIXMAN_OP_HSL_COLOR }, - { "HSL_LUMINOSITY", PIXMAN_OP_HSL_LUMINOSITY }, -}; - -static uint32_t -reader (const void *src, int size) -{ - switch (size) - { - case 1: - return *(uint8_t *)src; - case 2: - return *(uint16_t *)src; - case 4: - return *(uint32_t *)src; - default: - g_assert_not_reached(); - } -} - -static void -writer (void *src, uint32_t value, int size) -{ - switch (size) - { - case 1: - *(uint8_t *)src = value; - break; - - case 2: - *(uint16_t *)src = value; - break; - - case 4: - *(uint32_t *)src = value; - break; - - default: - break; - } -} - -int -main (int argc, char **argv) -{ -#define d2f pixman_double_to_fixed - - GtkWidget *window, *swindow; - GtkWidget *table; - uint32_t *dest = malloc (WIDTH * HEIGHT * 4); - uint32_t *src = malloc (WIDTH * HEIGHT * 4); - pixman_image_t *src_img; - pixman_image_t *dest_img; - pixman_point_fixed_t p1 = { -10 << 0, 0 }; - pixman_point_fixed_t p2 = { WIDTH << 16, (HEIGHT - 10) << 16 }; - uint16_t full = 0xcfff; - uint16_t low = 0x5000; - uint16_t alpha = 0xffff; - pixman_gradient_stop_t stops[6] = - { - { d2f (0.0), { full, low, low, alpha } }, - { d2f (0.25), { full, full, low, alpha } }, - { d2f (0.4), { low, full, low, alpha } }, - { d2f (0.6), { low, full, full, alpha } }, - { d2f (0.8), { low, low, full, alpha } }, - { d2f (1.0), { full, low, full, alpha } }, - }; - - int i; - - gtk_init (&argc, &argv); - - window = gtk_window_new (GTK_WINDOW_TOPLEVEL); - - gtk_window_set_default_size (GTK_WINDOW (window), 800, 600); - - g_signal_connect (window, "delete-event", - G_CALLBACK (gtk_main_quit), - NULL); - table = gtk_table_new 
(G_N_ELEMENTS (operators) / 6, 6, TRUE); - - src_img = pixman_image_create_linear_gradient (&p1, &p2, stops, - sizeof (stops) / sizeof (stops[0])); - - pixman_image_set_repeat (src_img, PIXMAN_REPEAT_PAD); - - dest_img = pixman_image_create_bits (PIXMAN_a8r8g8b8, - WIDTH, HEIGHT, - dest, - WIDTH * 4); - pixman_image_set_accessors (dest_img, reader, writer); - - for (i = 0; i < G_N_ELEMENTS (operators); ++i) - { - GtkWidget *image; - GdkPixbuf *pixbuf; - GtkWidget *vbox; - GtkWidget *label; - int j, k; - - vbox = gtk_vbox_new (FALSE, 0); - - label = gtk_label_new (operators[i].name); - gtk_box_pack_start (GTK_BOX (vbox), label, FALSE, FALSE, 6); - gtk_widget_show (label); - - for (j = 0; j < HEIGHT; ++j) - { - for (k = 0; k < WIDTH; ++k) - dest[j * WIDTH + k] = 0x7f6f6f00; - } - pixman_image_composite (operators[i].op, src_img, NULL, dest_img, - 0, 0, 0, 0, 0, 0, WIDTH, HEIGHT); - pixbuf = pixbuf_from_argb32 (pixman_image_get_data (dest_img), TRUE, - WIDTH, HEIGHT, WIDTH * 4); - image = gtk_image_new_from_pixbuf (pixbuf); - gtk_box_pack_start (GTK_BOX (vbox), image, FALSE, FALSE, 0); - gtk_widget_show (image); - - gtk_table_attach_defaults (GTK_TABLE (table), vbox, - i % 6, (i % 6) + 1, i / 6, (i / 6) + 1); - gtk_widget_show (vbox); - - g_object_unref (pixbuf); - } - - pixman_image_unref (src_img); - free (src); - pixman_image_unref (dest_img); - free (dest); - - swindow = gtk_scrolled_window_new (NULL, NULL); - gtk_scrolled_window_set_policy (GTK_SCROLLED_WINDOW (swindow), - GTK_POLICY_AUTOMATIC, - GTK_POLICY_AUTOMATIC); - - gtk_scrolled_window_add_with_viewport (GTK_SCROLLED_WINDOW (swindow), table); - gtk_widget_show (table); - - gtk_container_add (GTK_CONTAINER (window), swindow); - gtk_widget_show (swindow); - - gtk_widget_show (window); - - gtk_main (); - - return 0; -} diff --git a/pixman/test/convolution-test.c b/pixman/test/convolution-test.c deleted file mode 100644 index da284af7b..000000000 --- a/pixman/test/convolution-test.c +++ /dev/null @@ -1,47 +0,0 @@ -#include -#include -#include "pixman.h" -#include "gtk-utils.h" - -int -main (int argc, char **argv) -{ -#define WIDTH 200 -#define HEIGHT 200 - -#define d2f pixman_double_to_fixed - - uint32_t *src = malloc (WIDTH * HEIGHT * 4); - uint32_t *mask = malloc (WIDTH * HEIGHT * 4); - uint32_t *dest = malloc (WIDTH * HEIGHT * 4); - pixman_fixed_t convolution[] = - { - d2f (3), d2f (3), - d2f (0.5), d2f (0.5), d2f (0.5), - d2f (0.5), d2f (0.5), d2f (0.5), - d2f (0.5), d2f (0.5), d2f (0.5), - }; - pixman_image_t *simg, *mimg, *dimg; - - int i; - - for (i = 0; i < WIDTH * HEIGHT; ++i) - { - src[i] = 0x7f007f00; - mask[i] = (i % 256) * 0x01000000; - dest[i] = 0; - } - - simg = pixman_image_create_bits (PIXMAN_a8r8g8b8, WIDTH, HEIGHT, src, WIDTH * 4); - mimg = pixman_image_create_bits (PIXMAN_a8r8g8b8, WIDTH, HEIGHT, mask, WIDTH * 4); - dimg = pixman_image_create_bits (PIXMAN_a8r8g8b8, WIDTH, HEIGHT, dest, WIDTH * 4); - - pixman_image_set_filter (mimg, PIXMAN_FILTER_CONVOLUTION, - convolution, 11); - - pixman_image_composite (PIXMAN_OP_OVER, simg, mimg, dimg, 0, 0, 0, 0, 0, 0, WIDTH, HEIGHT); - - show_image (dimg); - - return 0; -} diff --git a/pixman/test/gradient-test.c b/pixman/test/gradient-test.c deleted file mode 100644 index fc84844b0..000000000 --- a/pixman/test/gradient-test.c +++ /dev/null @@ -1,89 +0,0 @@ -#include -#include -#include "pixman.h" -#include "gtk-utils.h" - -int -main (int argc, char **argv) -{ -#define WIDTH 400 -#define HEIGHT 200 - - uint32_t *dest = malloc (WIDTH * HEIGHT * 4); - pixman_image_t *src_img; - 
pixman_image_t *dest_img; - int i; - pixman_gradient_stop_t stops[2] = - { - { pixman_int_to_fixed (0), { 0xffff, 0xeeee, 0xeeee, 0xeeee } }, - { pixman_int_to_fixed (1), { 0xffff, 0x1111, 0x1111, 0x1111 } } - }; - pixman_point_fixed_t p1 = { pixman_double_to_fixed (0), 0 }; - pixman_point_fixed_t p2 = { pixman_double_to_fixed (WIDTH / 8.), - pixman_int_to_fixed (0) }; -#if 0 - pixman_transform_t trans = { - { { pixman_double_to_fixed (2), pixman_double_to_fixed (0.5), pixman_double_to_fixed (-100), }, - { pixman_double_to_fixed (0), pixman_double_to_fixed (3), pixman_double_to_fixed (0), }, - { pixman_double_to_fixed (0), pixman_double_to_fixed (0.000), pixman_double_to_fixed (1.0) } - } - }; -#else - pixman_transform_t trans = { - { { pixman_fixed_1, 0, 0 }, - { 0, pixman_fixed_1, 0 }, - { 0, 0, pixman_fixed_1 } } - }; -#endif - - pixman_point_fixed_t c_inner; - pixman_point_fixed_t c_outer; - pixman_fixed_t r_inner; - pixman_fixed_t r_outer; - - for (i = 0; i < WIDTH * HEIGHT; ++i) - dest[i] = 0x4f00004f; /* pale blue */ - - dest_img = pixman_image_create_bits (PIXMAN_a8r8g8b8, - WIDTH, HEIGHT, - dest, - WIDTH * 4); - - c_inner.x = pixman_double_to_fixed (50.0); - c_inner.y = pixman_double_to_fixed (50.0); - c_outer.x = pixman_double_to_fixed (50.0); - c_outer.y = pixman_double_to_fixed (50.0); - r_inner = 0; - r_outer = pixman_double_to_fixed (50.0); - - src_img = pixman_image_create_conical_gradient (&c_inner, r_inner, - stops, 2); -#if 0 - src_img = pixman_image_create_conical_gradient (&c_inner, r_inner, - stops, 2); - src_img = pixman_image_create_linear_gradient (&c_inner, &c_outer, - r_inner, r_outer, - stops, 2); -#endif - - src_img = pixman_image_create_linear_gradient (&p1, &p2, - stops, 2); - - pixman_image_set_transform (src_img, &trans); - pixman_image_set_repeat (src_img, PIXMAN_REPEAT_PAD); - - pixman_image_composite (PIXMAN_OP_OVER, src_img, NULL, dest_img, - 0, 0, 0, 0, 0, 0, 10 * WIDTH, HEIGHT); - - printf ("0, 0: %x\n", dest[0]); - printf ("10, 10: %x\n", dest[10 * 10 + 10]); - printf ("w, h: %x\n", dest[(HEIGHT - 1) * 100 + (WIDTH - 1)]); - - show_image (dest_img); - - pixman_image_unref (src_img); - pixman_image_unref (dest_img); - free (dest); - - return 0; -} diff --git a/pixman/test/gtk-utils.c b/pixman/test/gtk-utils.c deleted file mode 100644 index f45cdc912..000000000 --- a/pixman/test/gtk-utils.c +++ /dev/null @@ -1,115 +0,0 @@ -#include -#include -#include "pixman-private.h" /* For image->bits.format - * FIXME: there should probably be public API for this - */ -#include "gtk-utils.h" - -GdkPixbuf * -pixbuf_from_argb32 (uint32_t *bits, - gboolean has_alpha, - int width, - int height, - int stride) -{ - GdkPixbuf *pixbuf = gdk_pixbuf_new (GDK_COLORSPACE_RGB, TRUE, - 8, width, height); - int p_stride = gdk_pixbuf_get_rowstride (pixbuf); - guint32 *p_bits = (guint32 *)gdk_pixbuf_get_pixels (pixbuf); - int w, h; - - for (h = 0; h < height; ++h) - { - for (w = 0; w < width; ++w) - { - uint32_t argb = bits[h * (stride / 4) + w]; - guint r, g, b, a; - char *pb = (char *)p_bits; - - pb += h * p_stride + w * 4; - - r = (argb & 0x00ff0000) >> 16; - g = (argb & 0x0000ff00) >> 8; - b = (argb & 0x000000ff) >> 0; - a = has_alpha? 
(argb & 0xff000000) >> 24 : 0xff; - - if (a) - { - r = (r * 255) / a; - g = (g * 255) / a; - b = (b * 255) / a; - } - - if (r > 255) r = 255; - if (g > 255) g = 255; - if (b > 255) b = 255; - - pb[0] = r; - pb[1] = g; - pb[2] = b; - pb[3] = a; - } - } - - return pixbuf; -} - - -static gboolean -on_expose (GtkWidget *widget, GdkEventExpose *expose, gpointer data) -{ - GdkPixbuf *pixbuf = data; - - gdk_draw_pixbuf (widget->window, NULL, - pixbuf, 0, 0, 0, 0, - gdk_pixbuf_get_width (pixbuf), - gdk_pixbuf_get_height (pixbuf), - GDK_RGB_DITHER_NONE, - 0, 0); - - return TRUE; -} - -void -show_image (pixman_image_t *image) -{ - GtkWidget *window; - GdkPixbuf *pixbuf; - int width, height, stride; - int argc; - char **argv; - char *arg0 = g_strdup ("pixman-test-program"); - gboolean has_alpha; - pixman_format_code_t format; - - argc = 1; - argv = (char **)&arg0; - - gtk_init (&argc, &argv); - - window = gtk_window_new (GTK_WINDOW_TOPLEVEL); - width = pixman_image_get_width (image); - height = pixman_image_get_height (image); - stride = pixman_image_get_stride (image); - - gtk_window_set_default_size (GTK_WINDOW (window), width, height); - - format = image->bits.format; - - if (format == PIXMAN_a8r8g8b8) - has_alpha = TRUE; - else if (format == PIXMAN_x8r8g8b8) - has_alpha = FALSE; - else - g_error ("Can't deal with this format: %x\n", format); - - pixbuf = pixbuf_from_argb32 (pixman_image_get_data (image), has_alpha, - width, height, stride); - - g_signal_connect (window, "expose_event", G_CALLBACK (on_expose), pixbuf); - g_signal_connect (window, "delete_event", G_CALLBACK (gtk_main_quit), NULL); - - gtk_widget_show (window); - - gtk_main (); -} diff --git a/pixman/test/gtk-utils.h b/pixman/test/gtk-utils.h deleted file mode 100644 index 2cb13bcf0..000000000 --- a/pixman/test/gtk-utils.h +++ /dev/null @@ -1,13 +0,0 @@ -#include -#include -#include -#include -#include "pixman.h" - -void show_image (pixman_image_t *image); - -GdkPixbuf *pixbuf_from_argb32 (uint32_t *bits, - gboolean has_alpha, - int width, - int height, - int stride); diff --git a/pixman/test/radial-test.c b/pixman/test/radial-test.c deleted file mode 100644 index 5d716c339..000000000 --- a/pixman/test/radial-test.c +++ /dev/null @@ -1,198 +0,0 @@ -#include "utils.h" -#include "gtk-utils.h" - -#define NUM_GRADIENTS 7 -#define NUM_STOPS 3 -#define NUM_REPEAT 4 -#define SIZE 128 -#define WIDTH (SIZE * NUM_GRADIENTS) -#define HEIGHT (SIZE * NUM_REPEAT) - -/* - * We want to test all the possible relative positions of the start - * and end circle: - * - * - The start circle can be smaller/equal/bigger than the end - * circle. A radial gradient can be classified in one of these - * three cases depending on the sign of dr. - * - * - The smaller circle can be completely inside/internally - * tangent/outside (at least in part) of the bigger circle. This - * classification is the same as the one which can be computed by - * examining the sign of a = (dx^2 + dy^2 - dr^2). - * - * - If the two circles have the same size, neither can be inside or - * internally tangent - * - * This test draws radial gradients whose circles always have the same - * centers (0, 0) and (1, 0), but with different radiuses. 
From left - * to right: - * - * - Small start circle completely inside the end circle - * 0.25 -> 1.75; dr = 1.5 > 0; a = 1 - 1.50^2 < 0 - * - * - Small start circle internally tangent to the end circle - * 0.50 -> 1.50; dr = 1.0 > 0; a = 1 - 1.00^2 = 0 - * - * - Small start circle outside of the end circle - * 0.50 -> 1.00; dr = 0.5 > 0; a = 1 - 0.50^2 > 0 - * - * - Start circle with the same size as the end circle - * 1.00 -> 1.00; dr = 0.0 = 0; a = 1 - 0.00^2 > 0 - * - * - Small end circle outside of the start circle - * 1.00 -> 0.50; dr = -0.5 > 0; a = 1 - 0.50^2 > 0 - * - * - Small end circle internally tangent to the start circle - * 1.50 -> 0.50; dr = -1.0 > 0; a = 1 - 1.00^2 = 0 - * - * - Small end circle completely inside the start circle - * 1.75 -> 0.25; dr = -1.5 > 0; a = 1 - 1.50^2 < 0 - * - */ - -const static double radiuses[NUM_GRADIENTS] = { - 0.25, - 0.50, - 0.50, - 1.00, - 1.00, - 1.50, - 1.75 -}; - -#define double_to_color(x) \ - (((uint32_t) ((x)*65536)) - (((uint32_t) ((x)*65536)) >> 16)) - -#define PIXMAN_STOP(offset,r,g,b,a) \ - { pixman_double_to_fixed (offset), \ - { \ - double_to_color (r), \ - double_to_color (g), \ - double_to_color (b), \ - double_to_color (a) \ - } \ - } - -static const pixman_gradient_stop_t stops[NUM_STOPS] = { - PIXMAN_STOP (0.0, 1, 0, 0, 0.75), - PIXMAN_STOP (0.70710678, 0, 1, 0, 0), - PIXMAN_STOP (1.0, 0, 0, 1, 1) -}; - -static pixman_image_t * -create_radial (int index) -{ - pixman_point_fixed_t p0, p1; - pixman_fixed_t r0, r1; - double x0, x1, radius0, radius1, left, right, center; - - x0 = 0; - x1 = 1; - radius0 = radiuses[index]; - radius1 = radiuses[NUM_GRADIENTS - index - 1]; - - /* center the gradient */ - left = MIN (x0 - radius0, x1 - radius1); - right = MAX (x0 + radius0, x1 + radius1); - center = (left + right) * 0.5; - x0 -= center; - x1 -= center; - - /* scale to make it fit within a 1x1 rect centered in (0,0) */ - x0 *= 0.25; - x1 *= 0.25; - radius0 *= 0.25; - radius1 *= 0.25; - - p0.x = pixman_double_to_fixed (x0); - p0.y = pixman_double_to_fixed (0); - - p1.x = pixman_double_to_fixed (x1); - p1.y = pixman_double_to_fixed (0); - - r0 = pixman_double_to_fixed (radius0); - r1 = pixman_double_to_fixed (radius1); - - return pixman_image_create_radial_gradient (&p0, &p1, - r0, r1, - stops, NUM_STOPS); -} - -static const pixman_repeat_t repeat[NUM_REPEAT] = { - PIXMAN_REPEAT_NONE, - PIXMAN_REPEAT_NORMAL, - PIXMAN_REPEAT_REFLECT, - PIXMAN_REPEAT_PAD -}; - -int -main (int argc, char **argv) -{ - pixman_transform_t transform; - pixman_image_t *src_img, *dest_img; - int i, j; - - enable_fp_exceptions (); - - dest_img = pixman_image_create_bits (PIXMAN_a8r8g8b8, - WIDTH, HEIGHT, - NULL, 0); - - pixman_transform_init_identity (&transform); - - /* - * The create_radial() function returns gradients centered in the - * origin and whose interesting part fits a 1x1 square. We want to - * paint these gradients on a SIZExSIZE square and to make things - * easier we want the origin in the top-left corner of the square - * we want to see. - */ - pixman_transform_translate (NULL, &transform, - pixman_double_to_fixed (0.5), - pixman_double_to_fixed (0.5)); - - pixman_transform_scale (NULL, &transform, - pixman_double_to_fixed (SIZE), - pixman_double_to_fixed (SIZE)); - - /* - * Gradients are evaluated at the center of each pixel, so we need - * to translate by half a pixel to trigger some interesting - * cornercases. 
In particular, the original implementation of PDF - * radial gradients tried to divide by 0 when using this transform - * on the "tangent circles" cases. - */ - pixman_transform_translate (NULL, &transform, - pixman_double_to_fixed (0.5), - pixman_double_to_fixed (0.5)); - - for (i = 0; i < NUM_GRADIENTS; i++) - { - src_img = create_radial (i); - pixman_image_set_transform (src_img, &transform); - - for (j = 0; j < NUM_REPEAT; j++) - { - pixman_image_set_repeat (src_img, repeat[j]); - - pixman_image_composite32 (PIXMAN_OP_OVER, - src_img, - NULL, - dest_img, - 0, 0, - 0, 0, - i * SIZE, j * SIZE, - SIZE, SIZE); - - } - - pixman_image_unref (src_img); - } - - show_image (dest_img); - - pixman_image_unref (dest_img); - - return 0; -} diff --git a/pixman/test/scaling-test.c b/pixman/test/scaling-test.c index 7b78017a3..dbb9d39b0 100644 --- a/pixman/test/scaling-test.c +++ b/pixman/test/scaling-test.c @@ -1,250 +1,368 @@ -/* - * Test program, which can detect some problems with nearest neighbour - * and bilinear scaling in pixman. Testing is done by running lots - * of random SRC and OVER compositing operations a8r8g8b8, x8a8r8g8b8 - * and r5g6b5 color formats. - * - * Script 'fuzzer-find-diff.pl' can be used to narrow down the problem in - * the case of test failure. - */ -#include -#include -#include -#include "utils.h" - -#define MAX_SRC_WIDTH 16 -#define MAX_SRC_HEIGHT 16 -#define MAX_DST_WIDTH 16 -#define MAX_DST_HEIGHT 16 -#define MAX_STRIDE 4 - -/* - * Composite operation with pseudorandom images - */ -uint32_t -test_composite (int testnum, - int verbose) -{ - int i; - pixman_image_t * src_img; - pixman_image_t * dst_img; - pixman_transform_t transform; - pixman_region16_t clip; - int src_width, src_height; - int dst_width, dst_height; - int src_stride, dst_stride; - int src_x, src_y; - int dst_x, dst_y; - int src_bpp; - int dst_bpp; - int w, h; - pixman_fixed_t scale_x = 65536, scale_y = 65536; - pixman_fixed_t translate_x = 0, translate_y = 0; - pixman_op_t op; - pixman_repeat_t repeat = PIXMAN_REPEAT_NONE; - pixman_format_code_t src_fmt, dst_fmt; - uint32_t * srcbuf; - uint32_t * dstbuf; - uint32_t crc32; - FLOAT_REGS_CORRUPTION_DETECTOR_START (); - - lcg_srand (testnum); - - src_bpp = (lcg_rand_n (2) == 0) ? 2 : 4; - dst_bpp = (lcg_rand_n (2) == 0) ? 2 : 4; - op = (lcg_rand_n (2) == 0) ? PIXMAN_OP_SRC : PIXMAN_OP_OVER; - - src_width = lcg_rand_n (MAX_SRC_WIDTH) + 1; - src_height = lcg_rand_n (MAX_SRC_HEIGHT) + 1; - dst_width = lcg_rand_n (MAX_DST_WIDTH) + 1; - dst_height = lcg_rand_n (MAX_DST_HEIGHT) + 1; - src_stride = src_width * src_bpp + lcg_rand_n (MAX_STRIDE) * src_bpp; - dst_stride = dst_width * dst_bpp + lcg_rand_n (MAX_STRIDE) * dst_bpp; - - if (src_stride & 3) - src_stride += 2; - - if (dst_stride & 3) - dst_stride += 2; - - src_x = -(src_width / 4) + lcg_rand_n (src_width * 3 / 2); - src_y = -(src_height / 4) + lcg_rand_n (src_height * 3 / 2); - dst_x = -(dst_width / 4) + lcg_rand_n (dst_width * 3 / 2); - dst_y = -(dst_height / 4) + lcg_rand_n (dst_height * 3 / 2); - w = lcg_rand_n (dst_width * 3 / 2 - dst_x); - h = lcg_rand_n (dst_height * 3 / 2 - dst_y); - - srcbuf = (uint32_t *)malloc (src_stride * src_height); - dstbuf = (uint32_t *)malloc (dst_stride * dst_height); - - for (i = 0; i < src_stride * src_height; i++) - *((uint8_t *)srcbuf + i) = lcg_rand_n (256); - - for (i = 0; i < dst_stride * dst_height; i++) - *((uint8_t *)dstbuf + i) = lcg_rand_n (256); - - src_fmt = src_bpp == 4 ? (lcg_rand_n (2) == 0 ? 
- PIXMAN_a8r8g8b8 : PIXMAN_x8r8g8b8) : PIXMAN_r5g6b5; - - dst_fmt = dst_bpp == 4 ? (lcg_rand_n (2) == 0 ? - PIXMAN_a8r8g8b8 : PIXMAN_x8r8g8b8) : PIXMAN_r5g6b5; - - src_img = pixman_image_create_bits ( - src_fmt, src_width, src_height, srcbuf, src_stride); - - dst_img = pixman_image_create_bits ( - dst_fmt, dst_width, dst_height, dstbuf, dst_stride); - - image_endian_swap (src_img, src_bpp * 8); - image_endian_swap (dst_img, dst_bpp * 8); - - if (lcg_rand_n (8) > 0) - { - scale_x = -32768 * 3 + lcg_rand_N (65536 * 5); - scale_y = -32768 * 3 + lcg_rand_N (65536 * 5); - translate_x = lcg_rand_N (65536); - translate_y = lcg_rand_N (65536); - pixman_transform_init_scale (&transform, scale_x, scale_y); - pixman_transform_translate (&transform, NULL, translate_x, translate_y); - pixman_image_set_transform (src_img, &transform); - } - - switch (lcg_rand_n (4)) - { - case 0: - repeat = PIXMAN_REPEAT_NONE; - break; - - case 1: - repeat = PIXMAN_REPEAT_NORMAL; - break; - - case 2: - repeat = PIXMAN_REPEAT_PAD; - break; - - case 3: - repeat = PIXMAN_REPEAT_REFLECT; - break; - - default: - break; - } - pixman_image_set_repeat (src_img, repeat); - - if (lcg_rand_n (2)) - pixman_image_set_filter (src_img, PIXMAN_FILTER_NEAREST, NULL, 0); - else - pixman_image_set_filter (src_img, PIXMAN_FILTER_BILINEAR, NULL, 0); - - if (verbose) - { - printf ("src_fmt=%08X, dst_fmt=%08X\n", src_fmt, dst_fmt); - printf ("op=%d, scale_x=%d, scale_y=%d, repeat=%d\n", - op, scale_x, scale_y, repeat); - printf ("translate_x=%d, translate_y=%d\n", - translate_x, translate_y); - printf ("src_width=%d, src_height=%d, dst_width=%d, dst_height=%d\n", - src_width, src_height, dst_width, dst_height); - printf ("src_x=%d, src_y=%d, dst_x=%d, dst_y=%d\n", - src_x, src_y, dst_x, dst_y); - printf ("w=%d, h=%d\n", w, h); - } - - if (lcg_rand_n (8) == 0) - { - pixman_box16_t clip_boxes[2]; - int n = lcg_rand_n (2) + 1; - - for (i = 0; i < n; i++) - { - clip_boxes[i].x1 = lcg_rand_n (src_width); - clip_boxes[i].y1 = lcg_rand_n (src_height); - clip_boxes[i].x2 = - clip_boxes[i].x1 + lcg_rand_n (src_width - clip_boxes[i].x1); - clip_boxes[i].y2 = - clip_boxes[i].y1 + lcg_rand_n (src_height - clip_boxes[i].y1); - - if (verbose) - { - printf ("source clip box: [%d,%d-%d,%d]\n", - clip_boxes[i].x1, clip_boxes[i].y1, - clip_boxes[i].x2, clip_boxes[i].y2); - } - } - - pixman_region_init_rects (&clip, clip_boxes, n); - pixman_image_set_clip_region (src_img, &clip); - pixman_image_set_source_clipping (src_img, 1); - pixman_region_fini (&clip); - } - - if (lcg_rand_n (8) == 0) - { - pixman_box16_t clip_boxes[2]; - int n = lcg_rand_n (2) + 1; - for (i = 0; i < n; i++) - { - clip_boxes[i].x1 = lcg_rand_n (dst_width); - clip_boxes[i].y1 = lcg_rand_n (dst_height); - clip_boxes[i].x2 = - clip_boxes[i].x1 + lcg_rand_n (dst_width - clip_boxes[i].x1); - clip_boxes[i].y2 = - clip_boxes[i].y1 + lcg_rand_n (dst_height - clip_boxes[i].y1); - - if (verbose) - { - printf ("destination clip box: [%d,%d-%d,%d]\n", - clip_boxes[i].x1, clip_boxes[i].y1, - clip_boxes[i].x2, clip_boxes[i].y2); - } - } - pixman_region_init_rects (&clip, clip_boxes, n); - pixman_image_set_clip_region (dst_img, &clip); - pixman_region_fini (&clip); - } - - pixman_image_composite (op, src_img, NULL, dst_img, - src_x, src_y, 0, 0, dst_x, dst_y, w, h); - - if (dst_fmt == PIXMAN_x8r8g8b8) - { - /* ignore unused part */ - for (i = 0; i < dst_stride * dst_height / 4; i++) - dstbuf[i] &= 0xFFFFFF; - } - - image_endian_swap (dst_img, dst_bpp * 8); - - if (verbose) - { - int j; - - for (i = 0; i 
< dst_height; i++) - { - for (j = 0; j < dst_stride; j++) - printf ("%02X ", *((uint8_t *)dstbuf + i * dst_stride + j)); - - printf ("\n"); - } - } - - pixman_image_unref (src_img); - pixman_image_unref (dst_img); - - crc32 = compute_crc32 (0, dstbuf, dst_stride * dst_height); - free (srcbuf); - free (dstbuf); - - FLOAT_REGS_CORRUPTION_DETECTOR_FINISH (); - return crc32; -} - -int -main (int argc, const char *argv[]) -{ - pixman_disable_out_of_bounds_workaround (); - - return fuzzer_test_main("scaling", 8000000, 0x7F1AB59F, - test_composite, argc, argv); -} +/* + * Test program, which can detect some problems with nearest neighbour + * and bilinear scaling in pixman. Testing is done by running lots + * of random SRC and OVER compositing operations a8r8g8b8, x8a8r8g8b8 + * and r5g6b5 color formats. + * + * Script 'fuzzer-find-diff.pl' can be used to narrow down the problem in + * the case of test failure. + */ +#include +#include +#include +#include "utils.h" + +#define MAX_SRC_WIDTH 48 +#define MAX_SRC_HEIGHT 8 +#define MAX_DST_WIDTH 48 +#define MAX_DST_HEIGHT 8 +#define MAX_STRIDE 4 + +/* + * Composite operation with pseudorandom images + */ +uint32_t +test_composite (int testnum, + int verbose) +{ + int i; + pixman_image_t * src_img; + pixman_image_t * mask_img; + pixman_image_t * dst_img; + pixman_transform_t transform; + pixman_region16_t clip; + int src_width, src_height; + int mask_width, mask_height; + int dst_width, dst_height; + int src_stride, mask_stride, dst_stride; + int src_x, src_y; + int mask_x, mask_y; + int dst_x, dst_y; + int src_bpp; + int mask_bpp = 1; + int dst_bpp; + int w, h; + pixman_fixed_t scale_x = 65536, scale_y = 65536; + pixman_fixed_t translate_x = 0, translate_y = 0; + pixman_fixed_t mask_scale_x = 65536, mask_scale_y = 65536; + pixman_fixed_t mask_translate_x = 0, mask_translate_y = 0; + pixman_op_t op; + pixman_repeat_t repeat = PIXMAN_REPEAT_NONE; + pixman_repeat_t mask_repeat = PIXMAN_REPEAT_NONE; + pixman_format_code_t src_fmt, dst_fmt; + uint32_t * srcbuf; + uint32_t * dstbuf; + uint32_t * maskbuf; + uint32_t crc32; + FLOAT_REGS_CORRUPTION_DETECTOR_START (); + + lcg_srand (testnum); + + src_bpp = (lcg_rand_n (2) == 0) ? 2 : 4; + dst_bpp = (lcg_rand_n (2) == 0) ? 
2 : 4; + switch (lcg_rand_n (3)) + { + case 0: + op = PIXMAN_OP_SRC; + break; + case 1: + op = PIXMAN_OP_OVER; + break; + default: + op = PIXMAN_OP_ADD; + break; + } + + src_width = lcg_rand_n (MAX_SRC_WIDTH) + 1; + src_height = lcg_rand_n (MAX_SRC_HEIGHT) + 1; + + if (lcg_rand_n (2)) + { + mask_width = lcg_rand_n (MAX_SRC_WIDTH) + 1; + mask_height = lcg_rand_n (MAX_SRC_HEIGHT) + 1; + } + else + { + mask_width = mask_height = 1; + } + + dst_width = lcg_rand_n (MAX_DST_WIDTH) + 1; + dst_height = lcg_rand_n (MAX_DST_HEIGHT) + 1; + src_stride = src_width * src_bpp + lcg_rand_n (MAX_STRIDE) * src_bpp; + mask_stride = mask_width * mask_bpp + lcg_rand_n (MAX_STRIDE) * mask_bpp; + dst_stride = dst_width * dst_bpp + lcg_rand_n (MAX_STRIDE) * dst_bpp; + + if (src_stride & 3) + src_stride += 2; + + if (mask_stride & 1) + mask_stride += 1; + if (mask_stride & 2) + mask_stride += 2; + + if (dst_stride & 3) + dst_stride += 2; + + src_x = -(src_width / 4) + lcg_rand_n (src_width * 3 / 2); + src_y = -(src_height / 4) + lcg_rand_n (src_height * 3 / 2); + mask_x = -(mask_width / 4) + lcg_rand_n (mask_width * 3 / 2); + mask_y = -(mask_height / 4) + lcg_rand_n (mask_height * 3 / 2); + dst_x = -(dst_width / 4) + lcg_rand_n (dst_width * 3 / 2); + dst_y = -(dst_height / 4) + lcg_rand_n (dst_height * 3 / 2); + w = lcg_rand_n (dst_width * 3 / 2 - dst_x); + h = lcg_rand_n (dst_height * 3 / 2 - dst_y); + + srcbuf = (uint32_t *)malloc (src_stride * src_height); + maskbuf = (uint32_t *)malloc (mask_stride * mask_height); + dstbuf = (uint32_t *)malloc (dst_stride * dst_height); + + for (i = 0; i < src_stride * src_height; i++) + *((uint8_t *)srcbuf + i) = lcg_rand_n (256); + + for (i = 0; i < mask_stride * mask_height; i++) + *((uint8_t *)maskbuf + i) = lcg_rand_n (256); + + for (i = 0; i < dst_stride * dst_height; i++) + *((uint8_t *)dstbuf + i) = lcg_rand_n (256); + + src_fmt = src_bpp == 4 ? (lcg_rand_n (2) == 0 ? + PIXMAN_a8r8g8b8 : PIXMAN_x8r8g8b8) : PIXMAN_r5g6b5; + + dst_fmt = dst_bpp == 4 ? (lcg_rand_n (2) == 0 ? 
+ PIXMAN_a8r8g8b8 : PIXMAN_x8r8g8b8) : PIXMAN_r5g6b5; + + src_img = pixman_image_create_bits ( + src_fmt, src_width, src_height, srcbuf, src_stride); + + mask_img = pixman_image_create_bits ( + PIXMAN_a8, mask_width, mask_height, maskbuf, mask_stride); + + dst_img = pixman_image_create_bits ( + dst_fmt, dst_width, dst_height, dstbuf, dst_stride); + + image_endian_swap (src_img, src_bpp * 8); + image_endian_swap (dst_img, dst_bpp * 8); + + if (lcg_rand_n (4) > 0) + { + scale_x = -32768 * 3 + lcg_rand_N (65536 * 5); + scale_y = -32768 * 3 + lcg_rand_N (65536 * 5); + translate_x = lcg_rand_N (65536); + translate_y = lcg_rand_N (65536); + pixman_transform_init_scale (&transform, scale_x, scale_y); + pixman_transform_translate (&transform, NULL, translate_x, translate_y); + pixman_image_set_transform (src_img, &transform); + } + + if (lcg_rand_n (2) > 0) + { + mask_scale_x = -32768 * 3 + lcg_rand_N (65536 * 5); + mask_scale_y = -32768 * 3 + lcg_rand_N (65536 * 5); + mask_translate_x = lcg_rand_N (65536); + mask_translate_y = lcg_rand_N (65536); + pixman_transform_init_scale (&transform, mask_scale_x, mask_scale_y); + pixman_transform_translate (&transform, NULL, mask_translate_x, mask_translate_y); + pixman_image_set_transform (mask_img, &transform); + } + + switch (lcg_rand_n (4)) + { + case 0: + mask_repeat = PIXMAN_REPEAT_NONE; + break; + + case 1: + mask_repeat = PIXMAN_REPEAT_NORMAL; + break; + + case 2: + mask_repeat = PIXMAN_REPEAT_PAD; + break; + + case 3: + mask_repeat = PIXMAN_REPEAT_REFLECT; + break; + + default: + break; + } + pixman_image_set_repeat (mask_img, mask_repeat); + + switch (lcg_rand_n (4)) + { + case 0: + repeat = PIXMAN_REPEAT_NONE; + break; + + case 1: + repeat = PIXMAN_REPEAT_NORMAL; + break; + + case 2: + repeat = PIXMAN_REPEAT_PAD; + break; + + case 3: + repeat = PIXMAN_REPEAT_REFLECT; + break; + + default: + break; + } + pixman_image_set_repeat (src_img, repeat); + + if (lcg_rand_n (2)) + pixman_image_set_filter (src_img, PIXMAN_FILTER_NEAREST, NULL, 0); + else + pixman_image_set_filter (src_img, PIXMAN_FILTER_BILINEAR, NULL, 0); + + if (lcg_rand_n (2)) + pixman_image_set_filter (mask_img, PIXMAN_FILTER_NEAREST, NULL, 0); + else + pixman_image_set_filter (mask_img, PIXMAN_FILTER_BILINEAR, NULL, 0); + + if (verbose) + { + printf ("src_fmt=%08X, dst_fmt=%08X\n", src_fmt, dst_fmt); + printf ("op=%d, scale_x=%d, scale_y=%d, repeat=%d\n", + op, scale_x, scale_y, repeat); + printf ("translate_x=%d, translate_y=%d\n", + translate_x, translate_y); + printf ("src_width=%d, src_height=%d, dst_width=%d, dst_height=%d\n", + src_width, src_height, dst_width, dst_height); + printf ("src_x=%d, src_y=%d, dst_x=%d, dst_y=%d\n", + src_x, src_y, dst_x, dst_y); + printf ("w=%d, h=%d\n", w, h); + } + + if (lcg_rand_n (8) == 0) + { + pixman_box16_t clip_boxes[2]; + int n = lcg_rand_n (2) + 1; + + for (i = 0; i < n; i++) + { + clip_boxes[i].x1 = lcg_rand_n (src_width); + clip_boxes[i].y1 = lcg_rand_n (src_height); + clip_boxes[i].x2 = + clip_boxes[i].x1 + lcg_rand_n (src_width - clip_boxes[i].x1); + clip_boxes[i].y2 = + clip_boxes[i].y1 + lcg_rand_n (src_height - clip_boxes[i].y1); + + if (verbose) + { + printf ("source clip box: [%d,%d-%d,%d]\n", + clip_boxes[i].x1, clip_boxes[i].y1, + clip_boxes[i].x2, clip_boxes[i].y2); + } + } + + pixman_region_init_rects (&clip, clip_boxes, n); + pixman_image_set_clip_region (src_img, &clip); + pixman_image_set_source_clipping (src_img, 1); + pixman_region_fini (&clip); + } + + if (lcg_rand_n (8) == 0) + { + pixman_box16_t clip_boxes[2]; + int n = 
lcg_rand_n (2) + 1; + + for (i = 0; i < n; i++) + { + clip_boxes[i].x1 = lcg_rand_n (mask_width); + clip_boxes[i].y1 = lcg_rand_n (mask_height); + clip_boxes[i].x2 = + clip_boxes[i].x1 + lcg_rand_n (mask_width - clip_boxes[i].x1); + clip_boxes[i].y2 = + clip_boxes[i].y1 + lcg_rand_n (mask_height - clip_boxes[i].y1); + + if (verbose) + { + printf ("mask clip box: [%d,%d-%d,%d]\n", + clip_boxes[i].x1, clip_boxes[i].y1, + clip_boxes[i].x2, clip_boxes[i].y2); + } + } + + pixman_region_init_rects (&clip, clip_boxes, n); + pixman_image_set_clip_region (mask_img, &clip); + pixman_image_set_source_clipping (mask_img, 1); + pixman_region_fini (&clip); + } + + if (lcg_rand_n (8) == 0) + { + pixman_box16_t clip_boxes[2]; + int n = lcg_rand_n (2) + 1; + for (i = 0; i < n; i++) + { + clip_boxes[i].x1 = lcg_rand_n (dst_width); + clip_boxes[i].y1 = lcg_rand_n (dst_height); + clip_boxes[i].x2 = + clip_boxes[i].x1 + lcg_rand_n (dst_width - clip_boxes[i].x1); + clip_boxes[i].y2 = + clip_boxes[i].y1 + lcg_rand_n (dst_height - clip_boxes[i].y1); + + if (verbose) + { + printf ("destination clip box: [%d,%d-%d,%d]\n", + clip_boxes[i].x1, clip_boxes[i].y1, + clip_boxes[i].x2, clip_boxes[i].y2); + } + } + pixman_region_init_rects (&clip, clip_boxes, n); + pixman_image_set_clip_region (dst_img, &clip); + pixman_region_fini (&clip); + } + + if (lcg_rand_n (2) == 0) + pixman_image_composite (op, src_img, NULL, dst_img, + src_x, src_y, 0, 0, dst_x, dst_y, w, h); + else + pixman_image_composite (op, src_img, mask_img, dst_img, + src_x, src_y, mask_x, mask_y, dst_x, dst_y, w, h); + + if (dst_fmt == PIXMAN_x8r8g8b8) + { + /* ignore unused part */ + for (i = 0; i < dst_stride * dst_height / 4; i++) + dstbuf[i] &= 0xFFFFFF; + } + + image_endian_swap (dst_img, dst_bpp * 8); + + if (verbose) + { + int j; + + for (i = 0; i < dst_height; i++) + { + for (j = 0; j < dst_stride; j++) + printf ("%02X ", *((uint8_t *)dstbuf + i * dst_stride + j)); + + printf ("\n"); + } + } + + pixman_image_unref (src_img); + pixman_image_unref (mask_img); + pixman_image_unref (dst_img); + + crc32 = compute_crc32 (0, dstbuf, dst_stride * dst_height); + free (srcbuf); + free (maskbuf); + free (dstbuf); + + FLOAT_REGS_CORRUPTION_DETECTOR_FINISH (); + return crc32; +} + +int +main (int argc, const char *argv[]) +{ + pixman_disable_out_of_bounds_workaround (); + + return fuzzer_test_main("scaling", 8000000, 0x80DF1CB2, + test_composite, argc, argv); +} diff --git a/pixman/test/screen-test.c b/pixman/test/screen-test.c deleted file mode 100644 index e69dba3de..000000000 --- a/pixman/test/screen-test.c +++ /dev/null @@ -1,44 +0,0 @@ -#include -#include -#include "pixman.h" -#include "gtk-utils.h" - -int -main (int argc, char **argv) -{ -#define WIDTH 40 -#define HEIGHT 40 - - uint32_t *src1 = malloc (WIDTH * HEIGHT * 4); - uint32_t *src2 = malloc (WIDTH * HEIGHT * 4); - uint32_t *src3 = malloc (WIDTH * HEIGHT * 4); - uint32_t *dest = malloc (3 * WIDTH * 2 * HEIGHT * 4); - pixman_image_t *simg1, *simg2, *simg3, *dimg; - - int i; - - for (i = 0; i < WIDTH * HEIGHT; ++i) - { - src1[i] = 0x7ff00000; - src2[i] = 0x7f00ff00; - src3[i] = 0x7f0000ff; - } - - for (i = 0; i < 3 * WIDTH * 2 * HEIGHT; ++i) - { - dest[i] = 0x0; - } - - simg1 = pixman_image_create_bits (PIXMAN_a8r8g8b8, WIDTH, HEIGHT, src1, WIDTH * 4); - simg2 = pixman_image_create_bits (PIXMAN_a8r8g8b8, WIDTH, HEIGHT, src2, WIDTH * 4); - simg3 = pixman_image_create_bits (PIXMAN_a8r8g8b8, WIDTH, HEIGHT, src3, WIDTH * 4); - dimg = pixman_image_create_bits (PIXMAN_a8r8g8b8, 3 * WIDTH, 2 * HEIGHT, dest, 
3 * WIDTH * 4); - - pixman_image_composite (PIXMAN_OP_SCREEN, simg1, NULL, dimg, 0, 0, 0, 0, WIDTH, HEIGHT / 4, WIDTH, HEIGHT); - pixman_image_composite (PIXMAN_OP_SCREEN, simg2, NULL, dimg, 0, 0, 0, 0, (WIDTH/2), HEIGHT / 4 + HEIGHT / 2, WIDTH, HEIGHT); - pixman_image_composite (PIXMAN_OP_SCREEN, simg3, NULL, dimg, 0, 0, 0, 0, (4 * WIDTH) / 3, HEIGHT, WIDTH, HEIGHT); - - show_image (dimg); - - return 0; -} diff --git a/pixman/test/trap-test.c b/pixman/test/trap-test.c deleted file mode 100644 index 19295e7a5..000000000 --- a/pixman/test/trap-test.c +++ /dev/null @@ -1,49 +0,0 @@ -#include -#include -#include -#include "pixman.h" -#include "gtk-utils.h" - -int -main (int argc, char **argv) -{ -#define WIDTH 200 -#define HEIGHT 200 - - pixman_image_t *src_img; - pixman_image_t *mask_img; - pixman_image_t *dest_img; - pixman_trap_t trap; - pixman_color_t white = { 0x0000, 0xffff, 0x0000, 0xffff }; - uint32_t *bits = malloc (WIDTH * HEIGHT * 4); - uint32_t *mbits = malloc (WIDTH * HEIGHT); - - memset (mbits, 0, WIDTH * HEIGHT); - memset (bits, 0xff, WIDTH * HEIGHT * 4); - - trap.top.l = pixman_int_to_fixed (50) + 0x8000; - trap.top.r = pixman_int_to_fixed (150) + 0x8000; - trap.top.y = pixman_int_to_fixed (30); - - trap.bot.l = pixman_int_to_fixed (50) + 0x8000; - trap.bot.r = pixman_int_to_fixed (150) + 0x8000; - trap.bot.y = pixman_int_to_fixed (150); - - mask_img = pixman_image_create_bits (PIXMAN_a8, WIDTH, HEIGHT, mbits, WIDTH); - src_img = pixman_image_create_solid_fill (&white); - dest_img = pixman_image_create_bits (PIXMAN_a8r8g8b8, WIDTH, HEIGHT, bits, WIDTH * 4); - - pixman_add_traps (mask_img, 0, 0, 1, &trap); - - pixman_image_composite (PIXMAN_OP_OVER, - src_img, mask_img, dest_img, - 0, 0, 0, 0, 0, 0, WIDTH, HEIGHT); - - show_image (dest_img); - - pixman_image_unref (src_img); - pixman_image_unref (dest_img); - free (bits); - - return 0; -} -- cgit v1.2.3
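
A note on the reference checksum change in scaling-test.c above (0x7F1AB59F to 0x80DF1CB2): the test seeds a deterministic PRNG from the test number, drives every random decision (formats, sizes, transforms, clips) from that stream, computes a CRC32 of the destination buffer, and lets the harness fold the per-test CRCs into one aggregate compared against a stored reference. Because the rewrite draws extra random numbers for the new mask image and the PIXMAN_OP_ADD case, every per-test CRC shifts and the reference value has to be regenerated. The sketch below is illustrative only; it is not the utils.h implementation (lcg_rand_n, compute_crc32, fuzzer_test_main), and the LCG constants, CRC loop, and XOR aggregation are stand-ins chosen for brevity.

```c
/*
 * Minimal sketch of the deterministic fuzzer pattern used by
 * scaling-test.c: seed a PRNG from the test number, generate the
 * test input from that stream, checksum the output, and aggregate.
 * All helpers here are hypothetical stand-ins, not pixman code.
 */
#include <stdint.h>
#include <stdio.h>

static uint32_t lcg_state;

static void my_srand (uint32_t seed) { lcg_state = seed; }

/* simple LCG, stand-in for the lcg_rand_n() used in the patch */
static uint32_t my_rand_n (uint32_t n)
{
    lcg_state = lcg_state * 1103515245u + 12345u;
    return (lcg_state >> 16) % n;
}

/* bitwise CRC32 (polynomial 0xEDB88320), stand-in for compute_crc32() */
static uint32_t crc32_buf (uint32_t crc, const uint8_t *buf, size_t len)
{
    crc = ~crc;
    while (len--)
    {
        crc ^= *buf++;
        for (int k = 0; k < 8; k++)
            crc = (crc >> 1) ^ ((crc & 1) ? 0xEDB88320u : 0u);
    }
    return ~crc;
}

/* one "test": fill a small buffer pseudorandomly and checksum it;
 * a real test would composite images here instead */
static uint32_t run_test (int testnum)
{
    uint8_t buf[64];
    size_t  i;

    my_srand ((uint32_t) testnum);
    for (i = 0; i < sizeof buf; i++)
        buf[i] = (uint8_t) my_rand_n (256);

    return crc32_buf (0, buf, sizeof buf);
}

int main (void)
{
    /* fold per-test CRCs into one value, the way a fuzzer-style
     * harness might, and compare it against a recorded reference */
    uint32_t aggregate = 0;
    int t;

    for (t = 1; t <= 1000; t++)
        aggregate ^= run_test (t);

    printf ("aggregate checksum: 0x%08X\n", aggregate);
    return 0;
}
```

Seeding from the test number keeps every run reproducible, which is what lets the fuzzer-find-diff.pl script mentioned in the test's header comment bisect a failing aggregate down to a single test number; it also means any change in how many random values a test consumes, such as the mask parameters added by this patch, invalidates the old reference checksum by design.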