112 files changed, 5425 insertions, 991 deletions
diff --git a/mesalib/configure.ac b/mesalib/configure.ac
index e769edadb..9cc5c4ae5 100644
--- a/mesalib/configure.ac
+++ b/mesalib/configure.ac
@@ -608,8 +608,10 @@ AC_ARG_ENABLE([vdpau],
    [enable_vdpau=auto])
 AC_ARG_ENABLE([opencl],
    [AS_HELP_STRING([--enable-opencl],
-         [enable OpenCL library @<:@default=no@:>@])],
-   [enable_opencl="$enableval"],
+         [enable OpenCL library NOTE: Enabling this option will also enable
+          --with-llvm-shared-libs
+          @<:@default=no@:>@])],
+   [enable_opencl="$enableval" with_llvm_shared_libs="$enableval"],
    [enable_opencl=no])
 AC_ARG_ENABLE([xlib_glx],
     [AS_HELP_STRING([--enable-xlib-glx],
@@ -1660,10 +1662,7 @@ if test "x$enable_gallium_llvm" = xyes; then
     if test "x$LLVM_CONFIG" != xno; then
 	LLVM_VERSION=`$LLVM_CONFIG --version | sed 's/svn.*//g'`
 	LLVM_VERSION_INT=`echo $LLVM_VERSION | sed -e 's/\([[0-9]]\)\.\([[0-9]]\)/\10\2/g'`
-        if test "x$with_llvm_shared_libs" = xyes; then
-	    dnl We can't use $LLVM_VERSION because it has 'svn' stripped out,
-	    LLVM_LIBS="-lLLVM-`$LLVM_CONFIG --version`"
-	else
+        if test "x$with_llvm_shared_libs" != xyes; then
             LLVM_COMPONENTS="engine bitwriter"
             if $LLVM_CONFIG --components | grep -q '\<mcjit\>'; then
                 LLVM_COMPONENTS="${LLVM_COMPONENTS} mcjit"
@@ -1672,7 +1671,6 @@ if test "x$enable_gallium_llvm" = xyes; then
             if test "x$enable_opencl" = xyes; then
                 LLVM_COMPONENTS="${LLVM_COMPONENTS} ipo linker instrumentation"
             fi
-            LLVM_LIBS="`$LLVM_CONFIG --libs ${LLVM_COMPONENTS}`"
 	fi
 	LLVM_LDFLAGS=`$LLVM_CONFIG --ldflags`
 	LLVM_BINDIR=`$LLVM_CONFIG --bindir`
@@ -1797,7 +1795,7 @@ radeon_llvm_check() {
                       configure flag])
     fi
     AC_MSG_WARN([Please ensure you use the latest llvm tree from git://people.freedesktop.org/~tstellar/llvm master before submitting a bug])
-    LLVM_LIBS="$LLVM_LIBS `$LLVM_CONFIG --libs r600`"
+    LLVM_COMPONENTS="${LLVM_COMPONENTS} r600"
 }
 
 dnl Gallium drivers
@@ -1836,12 +1834,13 @@ if test "x$with_gallium_drivers" != x; then
             if test "x$enable_r600_llvm" = xyes -o "x$enable_opencl" = xyes; then
                 radeon_llvm_check
                 NEED_RADEON_GALLIUM=yes;
+                LLVM_COMPONENTS="${LLVM_COMPONENTS} ipo"
             fi
             if test "x$enable_r600_llvm" = xyes; then
                 USE_R600_LLVM_COMPILER=yes;
             fi
             if test "x$enable_opencl" = xyes -a "x$with_llvm_shared_libs" = xno; then
-                LLVM_LIBS="${LLVM_LIBS} `$LLVM_CONFIG --libs bitreader asmparser`"
+                LLVM_COMPONENTS="${LLVM_COMPONENTS} bitreader asmparser"
             fi
             gallium_check_st "radeon/drm" "dri-r600" "xorg-r600" "" "xvmc-r600" "vdpau-r600"
             ;;
@@ -1891,6 +1890,50 @@ if test "x$with_gallium_drivers" != x; then
         esac
     done
 fi
+
+dnl Set LLVM_LIBS - This is done after the driver configuration so
+dnl that drivers can add additional components to LLVM_COMPONENTS.
+dnl Previously, gallium drivers were updating LLVM_LIBS directly
+dnl by calling llvm-config --libs ${DRIVER_LLVM_COMPONENTS}, but
+dnl this was causing the same libraries to appear multiple times
+dnl in LLVM_LIBS.
+
+if test "x$MESA_LLVM" != x0; then
+
+    LLVM_LIBS="`$LLVM_CONFIG --libs ${LLVM_COMPONENTS}`"
+
+    if test "x$with_llvm_shared_libs" = xyes; then
+        dnl We can't use $LLVM_VERSION because it has 'svn' stripped out,
+        LLVM_SO_NAME=LLVM-`$LLVM_CONFIG --version`
+        AC_CHECK_FILE("$LLVM_LIBDIR/lib$LLVM_SO_NAME.so", llvm_have_one_so=yes,)
+
+        if test "x$llvm_have_one_so" = xyes; then
+            dnl LLVM was built using auto*, so there is only one shared object.
+            LLVM_LIBS="-l$LLVM_SO_NAME"
+        else
+            dnl If LLVM was built with CMake, there will be one shared object per
+            dnl component.
+            AC_CHECK_FILE("$LLVM_LIBDIR/libLLVMTarget.so",,
+                    AC_MSG_ERROR([Could not find llvm shared libraries:
+	Please make sure you have built llvm with the --enable-shared option
+	and that your llvm libraries are installed in $LLVM_LIBDIR
+	If you have installed your llvm libraries to a different directory you
+	can use the --with-llvm-prefix= configure flag to specify this directory.
+	NOTE: Mesa is attempting to use llvm shared libraries because you have
+	passed one of the following options to configure:
+		--with-llvm-shared-libs
+		--enable-opencl
+	If you do not want to build with llvm shared libraries and instead want to
+	use llvm static libraries then remove these options from your configure
+	invocation and reconfigure.]))
+
+           dnl We don't need to update LLVM_LIBS in this case because the LLVM
+           dnl install uses a shared object for each component and we have
+           dnl already added all of these objects to LLVM_LIBS.
+        fi
+    fi
+fi
+
 AM_CONDITIONAL(HAVE_GALLIUM_SVGA, test "x$HAVE_GALLIUM_SVGA" = xyes)
 AM_CONDITIONAL(HAVE_GALLIUM_I915, test "x$HAVE_GALLIUM_I915" = xyes)
 AM_CONDITIONAL(HAVE_GALLIUM_R300, test "x$HAVE_GALLIUM_R300" = xyes)
diff --git a/mesalib/include/GLES3/gl3.h b/mesalib/include/GLES3/gl3.h
index b9399e994..09f2b5333 100644
--- a/mesalib/include/GLES3/gl3.h
+++ b/mesalib/include/GLES3/gl3.h
@@ -2,7 +2,7 @@
 #define __gl3_h_
 
 /* 
- * gl3.h last updated on $Date: 2012-09-12 10:13:02 -0700 (Wed, 12 Sep 2012) $
+ * gl3.h last updated on $Date: 2012-10-03 07:52:40 -0700 (Wed, 03 Oct 2012) $
  */
 
 #include <GLES3/gl3platform.h>
@@ -796,7 +796,7 @@ typedef struct __GLsync *GLsync;
 #define GL_TEXTURE_IMMUTABLE_FORMAT                      0x912F
 #define GL_MAX_ELEMENT_INDEX                             0x8D6B
 #define GL_NUM_SAMPLE_COUNTS                             0x9380
-#define GL_TEXTURE_IMMUTABLE_LEVELS                      0x8D63
+#define GL_TEXTURE_IMMUTABLE_LEVELS                      0x82DF
 
 /*-------------------------------------------------------------------------
  * Entrypoint definitions
diff --git a/mesalib/src/gallium/auxiliary/Makefile.am b/mesalib/src/gallium/auxiliary/Makefile.am
index 49792930a..a4eee4773 100644
--- a/mesalib/src/gallium/auxiliary/Makefile.am
+++ b/mesalib/src/gallium/auxiliary/Makefile.am
@@ -45,9 +45,3 @@ util/u_format_srgb.c: $(srcdir)/util/u_format_srgb.py
 
 util/u_format_table.c: $(srcdir)/util/u_format_table.py $(srcdir)/util/u_format_pack.py $(srcdir)/util/u_format_parse.py $(srcdir)/util/u_format.csv
 	$(AM_V_GEN) $(PYTHON2) $(srcdir)/util/u_format_table.py $(srcdir)/util/u_format.csv > $@
-
-# XXX: As a work around for https://bugs.freedesktop.org/show_bug.cgi?id=59334
-# clover needs to link against libgallium.a. Delete this once we have a real
-# fix for this bug.
-all-local: libgallium.la
-	ln -f $(builddir)/.libs/libgallium.a $(builddir)/libgallium.a
diff --git a/mesalib/src/gallium/auxiliary/util/u_debug.c b/mesalib/src/gallium/auxiliary/util/u_debug.c
index 6e8c5b993..f4670f28c 100644
--- a/mesalib/src/gallium/auxiliary/util/u_debug.c
+++ b/mesalib/src/gallium/auxiliary/util/u_debug.c
@@ -232,7 +232,7 @@ debug_get_flags_option(const char *name,
    unsigned long result;
    const char *str;
    const struct debug_named_value *orig = flags;
-   int namealign = 0;
+   unsigned namealign = 0;
    
    str = os_get_option(name);
    if(!str)
diff --git a/mesalib/src/gallium/auxiliary/util/u_tile.c b/mesalib/src/gallium/auxiliary/util/u_tile.c
index 6c618a674..62298cdab 100644
--- a/mesalib/src/gallium/auxiliary/util/u_tile.c
+++ b/mesalib/src/gallium/auxiliary/util/u_tile.c
@@ -806,7 +806,7 @@ pipe_put_tile_z(struct pipe_transfer *pt,
             for (j = 0; j < w; j++) {
                /* convert 32-bit integer Z to float Z */
                const double scale = 1.0 / 0xffffffffU;
-               pDest[j] = ptrc[j] * scale;
+               pDest[j] = (float) (ptrc[j] * scale);
             }
             pDest += pt->stride/4;
             ptrc += srcStride;
@@ -820,7 +820,7 @@ pipe_put_tile_z(struct pipe_transfer *pt,
             for (j = 0; j < w; j++) {
                /* convert 32-bit integer Z to float Z */
                const double scale = 1.0 / 0xffffffffU;
-               pDest[j*2] = ptrc[j] * scale;
+               pDest[j*2] = (float) (ptrc[j] * scale);
             }
             pDest += pt->stride/4;
             ptrc += srcStride;
diff --git a/mesalib/src/gallium/auxiliary/util/u_tile.h b/mesalib/src/gallium/auxiliary/util/u_tile.h
index abcd402c8..9e8194459 100644
--- a/mesalib/src/gallium/auxiliary/util/u_tile.h
+++ b/mesalib/src/gallium/auxiliary/util/u_tile.h
@@ -45,13 +45,13 @@ struct pipe_transfer;
 static INLINE boolean
 u_clip_tile(uint x, uint y, uint *w, uint *h, const struct pipe_box *box)
 {
-   if (x >= box->width)
+   if ((int) x >= box->width)
       return TRUE;
-   if (y >= box->height)
+   if ((int) y >= box->height)
       return TRUE;
-   if (x + *w > box->width)
+   if ((int) (x + *w) > box->width)
       *w = box->width - x;
-   if (y + *h > box->height)
+   if ((int) (y + *h) > box->height)
       *h = box->height - y;
    return FALSE;
 }
diff --git a/mesalib/src/gallium/auxiliary/util/u_upload_mgr.c b/mesalib/src/gallium/auxiliary/util/u_upload_mgr.c
index ee1c6881e..6859751c5 100644
--- a/mesalib/src/gallium/auxiliary/util/u_upload_mgr.c
+++ b/mesalib/src/gallium/auxiliary/util/u_upload_mgr.c
@@ -163,6 +163,13 @@ enum pipe_error u_upload_alloc( struct u_upload_mgr *upload,
    unsigned alloc_offset = align(min_out_offset, upload->alignment);
    unsigned offset;
 
+   /* Init these return values here in case we fail below to make
+    * sure the caller doesn't get garbage values.
+    */
+   *out_offset = ~0;
+   pipe_resource_reference(outbuf, NULL);
+   *ptr = NULL;
+
    /* Make sure we have enough space in the upload buffer
     * for the sub-allocation. */
    if (MAX2(upload->offset, alloc_offset) + alloc_size > upload->size) {
@@ -182,8 +189,6 @@ enum pipe_error u_upload_alloc( struct u_upload_mgr *upload,
 					  PIPE_TRANSFER_UNSYNCHRONIZED,
 					  &upload->transfer);
       if (!upload->map) {
-         pipe_resource_reference(outbuf, NULL);
-         *ptr = NULL;
          upload->transfer = NULL;
          return PIPE_ERROR_OUT_OF_MEMORY;
       }
diff --git a/mesalib/src/gallium/auxiliary/util/u_vbuf.c b/mesalib/src/gallium/auxiliary/util/u_vbuf.c
index b712b52de..244b04d2a 100644
--- a/mesalib/src/gallium/auxiliary/util/u_vbuf.c
+++ b/mesalib/src/gallium/auxiliary/util/u_vbuf.c
@@ -323,7 +323,7 @@ void u_vbuf_destroy(struct u_vbuf *mgr)
    FREE(mgr);
 }
 
-static void
+static enum pipe_error
 u_vbuf_translate_buffers(struct u_vbuf *mgr, struct translate_key *key,
                          unsigned vb_mask, unsigned out_vb,
                          int start_vertex, unsigned num_vertices,
@@ -335,6 +335,7 @@ u_vbuf_translate_buffers(struct u_vbuf *mgr, struct translate_key *key,
    struct pipe_resource *out_buffer = NULL;
    uint8_t *out_map;
    unsigned out_offset, mask;
+   enum pipe_error err;
 
    /* Get a translate object. */
    tr = translate_cache_find(mgr->translate_cache, key);
@@ -381,6 +382,14 @@ u_vbuf_translate_buffers(struct u_vbuf *mgr, struct translate_key *key,
 
       assert((ib->buffer || ib->user_buffer) && ib->index_size);
 
+      /* Create and map the output buffer. */
+      err = u_upload_alloc(mgr->uploader, 0,
+                           key->output_stride * num_indices,
+                           &out_offset, &out_buffer,
+                           (void**)&out_map);
+      if (err != PIPE_OK)
+         return err;
+
       if (ib->user_buffer) {
          map = (uint8_t*)ib->user_buffer + offset;
       } else {
@@ -389,12 +398,6 @@ u_vbuf_translate_buffers(struct u_vbuf *mgr, struct translate_key *key,
                                      PIPE_TRANSFER_READ, &transfer);
       }
 
-      /* Create and map the output buffer. */
-      u_upload_alloc(mgr->uploader, 0,
-                     key->output_stride * num_indices,
-                     &out_offset, &out_buffer,
-                     (void**)&out_map);
-
       switch (ib->index_size) {
       case 4:
          tr->run_elts(tr, (unsigned*)map, num_indices, 0, out_map);
@@ -412,11 +415,13 @@ u_vbuf_translate_buffers(struct u_vbuf *mgr, struct translate_key *key,
       }
    } else {
       /* Create and map the output buffer. */
-      u_upload_alloc(mgr->uploader,
-                     key->output_stride * start_vertex,
-                     key->output_stride * num_vertices,
-                     &out_offset, &out_buffer,
-                     (void**)&out_map);
+      err = u_upload_alloc(mgr->uploader,
+                           key->output_stride * start_vertex,
+                           key->output_stride * num_vertices,
+                           &out_offset, &out_buffer,
+                           (void**)&out_map);
+      if (err != PIPE_OK)
+         return err;
 
       out_offset -= key->output_stride * start_vertex;
 
@@ -441,6 +446,8 @@ u_vbuf_translate_buffers(struct u_vbuf *mgr, struct translate_key *key,
    pipe_resource_reference(
       &mgr->real_vertex_buffer[out_vb].buffer, NULL);
    mgr->real_vertex_buffer[out_vb].buffer = out_buffer;
+
+   return PIPE_OK;
 }
 
 static boolean
@@ -588,11 +595,14 @@ u_vbuf_translate_begin(struct u_vbuf *mgr,
    /* Translate buffers. */
    for (type = 0; type < VB_NUM; type++) {
       if (key[type].nr_elements) {
-         u_vbuf_translate_buffers(mgr, &key[type], mask[type],
-                                  mgr->fallback_vbs[type],
-                                  start[type], num[type],
-                                  start_index, num_indices, min_index,
-                                  unroll_indices && type == VB_VERTEX);
+         enum pipe_error err;
+         err = u_vbuf_translate_buffers(mgr, &key[type], mask[type],
+                                        mgr->fallback_vbs[type],
+                                        start[type], num[type],
+                                        start_index, num_indices, min_index,
+                                        unroll_indices && type == VB_VERTEX);
+         if (err != PIPE_OK)
+            return FALSE;
 
          /* Fixup the stride for constant attribs. */
          if (type == VB_CONST) {
@@ -884,7 +894,7 @@ void u_vbuf_set_index_buffer(struct u_vbuf *mgr,
    pipe->set_index_buffer(pipe, ib);
 }
 
-static void
+static enum pipe_error
 u_vbuf_upload_buffers(struct u_vbuf *mgr,
                       int start_vertex, unsigned num_vertices,
                       int start_instance, unsigned num_instances)
@@ -953,6 +963,7 @@ u_vbuf_upload_buffers(struct u_vbuf *mgr,
       unsigned start, end;
       struct pipe_vertex_buffer *real_vb;
       const uint8_t *ptr;
+      enum pipe_error err;
 
       i = u_bit_scan(&buffer_mask);
 
@@ -963,11 +974,15 @@ u_vbuf_upload_buffers(struct u_vbuf *mgr,
       real_vb = &mgr->real_vertex_buffer[i];
       ptr = mgr->vertex_buffer[i].user_buffer;
 
-      u_upload_data(mgr->uploader, start, end - start, ptr + start,
-                    &real_vb->buffer_offset, &real_vb->buffer);
+      err = u_upload_data(mgr->uploader, start, end - start, ptr + start,
+                          &real_vb->buffer_offset, &real_vb->buffer);
+      if (err != PIPE_OK)
+         return err;
 
       real_vb->buffer_offset -= start;
    }
+
+   return PIPE_OK;
 }
 
 static boolean u_vbuf_need_minmax_index(struct u_vbuf *mgr)
@@ -1176,11 +1191,13 @@ void u_vbuf_draw_vbo(struct u_vbuf *mgr, const struct pipe_draw_info *info)
    if (unroll_indices ||
        incompatible_vb_mask ||
        mgr->ve->incompatible_elem_mask) {
-      /* XXX check the return value */
-      u_vbuf_translate_begin(mgr, start_vertex, num_vertices,
-                             info->start_instance, info->instance_count,
-                             info->start, info->count, min_index,
-                             unroll_indices);
+      if (!u_vbuf_translate_begin(mgr, start_vertex, num_vertices,
+                                  info->start_instance, info->instance_count,
+                                  info->start, info->count, min_index,
+                                  unroll_indices)) {
+         debug_warn_once("u_vbuf_translate_begin() failed");
+         return;
+      }
 
       user_vb_mask &= ~(incompatible_vb_mask |
                         mgr->ve->incompatible_vb_mask_all);
@@ -1188,8 +1205,13 @@ void u_vbuf_draw_vbo(struct u_vbuf *mgr, const struct pipe_draw_info *info)
 
    /* Upload user buffers. */
    if (user_vb_mask) {
-      u_vbuf_upload_buffers(mgr, start_vertex, num_vertices,
-                            info->start_instance, info->instance_count);
+      if (u_vbuf_upload_buffers(mgr, start_vertex, num_vertices,
+                                info->start_instance,
+                                info->instance_count) != PIPE_OK) {
+         debug_warn_once("u_vbuf_upload_buffers() failed");
+         return;
+      }
+
       mgr->dirty_real_vb_mask |= user_vb_mask;
    }
 
diff --git a/mesalib/src/glsl/Makefile.am b/mesalib/src/glsl/Makefile.am
index 058d8aed3..d0e5cd1d0 100644
--- a/mesalib/src/glsl/Makefile.am
+++ b/mesalib/src/glsl/Makefile.am
@@ -52,6 +52,7 @@ check_PROGRAMS =					\
 
 tests_uniform_initializer_test_SOURCES =		\
 	$(top_srcdir)/src/mesa/main/hash_table.c	\
+	$(top_srcdir)/src/mesa/main/imports.c		\
 	$(top_srcdir)/src/mesa/program/prog_hash_table.c\
 	$(top_srcdir)/src/mesa/program/symbol_table.c	\
 	tests/copy_constant_to_storage_tests.cpp	\
@@ -100,6 +101,7 @@ endif
 
 glsl_test_SOURCES = \
 	$(top_srcdir)/src/mesa/main/hash_table.c \
+	$(top_srcdir)/src/mesa/main/imports.c \
 	$(top_srcdir)/src/mesa/program/prog_hash_table.c \
 	$(top_srcdir)/src/mesa/program/symbol_table.c \
 	$(GLSL_SRCDIR)/standalone_scaffolding.cpp \
diff --git a/mesalib/src/glsl/Makefile.sources b/mesalib/src/glsl/Makefile.sources
index de63c3246..c294aa429 100644
--- a/mesalib/src/glsl/Makefile.sources
+++ b/mesalib/src/glsl/Makefile.sources
@@ -47,6 +47,8 @@ LIBGLSL_FILES = \
 	$(GLSL_SRCDIR)/link_functions.cpp \
 	$(GLSL_SRCDIR)/link_uniforms.cpp \
 	$(GLSL_SRCDIR)/link_uniform_initializers.cpp \
+	$(GLSL_SRCDIR)/link_uniform_block_active_visitor.cpp \
+	$(GLSL_SRCDIR)/link_uniform_blocks.cpp \
 	$(GLSL_SRCDIR)/link_varyings.cpp \
 	$(GLSL_SRCDIR)/loop_analysis.cpp \
 	$(GLSL_SRCDIR)/loop_controls.cpp \
@@ -60,6 +62,7 @@ LIBGLSL_FILES = \
 	$(GLSL_SRCDIR)/lower_mat_op_to_vec.cpp \
 	$(GLSL_SRCDIR)/lower_noise.cpp \
 	$(GLSL_SRCDIR)/lower_packed_varyings.cpp \
+	$(GLSL_SRCDIR)/lower_packing_builtins.cpp \
 	$(GLSL_SRCDIR)/lower_texture_projection.cpp \
 	$(GLSL_SRCDIR)/lower_variable_index_to_cond_assign.cpp \
 	$(GLSL_SRCDIR)/lower_vec_index_to_cond_assign.cpp \
diff --git a/mesalib/src/glsl/SConscript b/mesalib/src/glsl/SConscript
index 6981f041b..c4ab97c1e 100644
--- a/mesalib/src/glsl/SConscript
+++ b/mesalib/src/glsl/SConscript
@@ -59,6 +59,7 @@ else:
     # Copy these files to avoid generation object files into src/mesa/program
     env.Prepend(CPPPATH = ['#src/mesa/main'])
     env.Command('hash_table.c', '#src/mesa/main/hash_table.c', Copy('$TARGET', '$SOURCE'))
+    env.Command('imports.c', '#src/mesa/main/imports.c', Copy('$TARGET', '$SOURCE'))
     # Copy these files to avoid generation object files into src/mesa/program
     env.Prepend(CPPPATH = ['#src/mesa/program'])
     env.Command('prog_hash_table.c', '#src/mesa/program/prog_hash_table.c', Copy('$TARGET', '$SOURCE'))
@@ -68,6 +69,7 @@ else:
 
     mesa_objs = env.StaticObject([
         'hash_table.c',
+        'imports.c',
         'prog_hash_table.c',
         'symbol_table.c',
     ])
diff --git a/mesalib/src/glsl/ast.h b/mesalib/src/glsl/ast.h
index 50747822d..1a28963c4 100644
--- a/mesalib/src/glsl/ast.h
+++ b/mesalib/src/glsl/ast.h
@@ -804,11 +804,12 @@ public:
 class ast_uniform_block : public ast_node {
 public:
    ast_uniform_block(ast_type_qualifier layout,
-		     const char *block_name,
-		     ast_declarator_list *member_list)
-   : layout(layout), block_name(block_name)
+                     const char *instance_name,
+		     ast_expression *array_size)
+   : layout(layout), block_name(NULL), instance_name(instance_name),
+     array_size(array_size)
    {
-      declarations.push_degenerate_list_at_head(&member_list->link);
+      /* empty */
    }
 
    virtual ir_rvalue *hir(exec_list *instructions,
@@ -816,8 +817,28 @@ public:
 
    ast_type_qualifier layout;
    const char *block_name;
+
+   /**
+    * Declared name of the block instance, if specified.
+    *
+    * If the block does not have an instance name, this field will be
+    * \c NULL.
+    */
+   const char *instance_name;
+
    /** List of ast_declarator_list * */
    exec_list declarations;
+
+   /**
+    * Declared array size of the block instance
+    *
+    * If the block is not declared as an array, this field will be \c NULL.
+    *
+    * \note
+    * A block can only be an array if it also has an instance name.  If this
+    * field is not \c NULL, ::instance_name must also not be \c NULL.
+    */
+   ast_expression *array_size;
 };
 /*@}*/
 
diff --git a/mesalib/src/glsl/ast_function.cpp b/mesalib/src/glsl/ast_function.cpp
index dc7a58bf2..26f72cf8e 100644
--- a/mesalib/src/glsl/ast_function.cpp
+++ b/mesalib/src/glsl/ast_function.cpp
@@ -132,12 +132,13 @@ verify_parameter_modes(_mesa_glsl_parse_state *state,
       }
 
       /* Verify that 'out' and 'inout' actual parameters are lvalues. */
-      if (formal->mode == ir_var_out || formal->mode == ir_var_inout) {
+      if (formal->mode == ir_var_function_out
+          || formal->mode == ir_var_function_inout) {
 	 const char *mode = NULL;
 	 switch (formal->mode) {
-	 case ir_var_out:   mode = "out";   break;
-	 case ir_var_inout: mode = "inout"; break;
-	 default:           assert(false);  break;
+	 case ir_var_function_out:   mode = "out";   break;
+	 case ir_var_function_inout: mode = "inout"; break;
+	 default:                    assert(false);  break;
 	 }
 
 	 /* This AST-based check catches errors like f(i++).  The IR-based
@@ -210,13 +211,13 @@ generate_call(exec_list *instructions, ir_function_signature *sig,
       if (formal->type->is_numeric() || formal->type->is_boolean()) {
 	 switch (formal->mode) {
 	 case ir_var_const_in:
-	 case ir_var_in: {
+	 case ir_var_function_in: {
 	    ir_rvalue *converted
 	       = convert_component(actual, formal->type);
 	    actual->replace_with(converted);
 	    break;
 	 }
-	 case ir_var_out:
+	 case ir_var_function_out:
 	    if (actual->type != formal->type) {
 	       /* To convert an out parameter, we need to create a
 		* temporary variable to hold the value before conversion,
@@ -254,7 +255,7 @@ generate_call(exec_list *instructions, ir_function_signature *sig,
 	       actual->replace_with(deref_tmp_2);
 	    }
 	    break;
-	 case ir_var_inout:
+	 case ir_var_function_inout:
 	    /* Inout parameters should never require conversion, since that
 	     * would require an implicit conversion to exist both to and
 	     * from the formal parameter type, and there are no
diff --git a/mesalib/src/glsl/ast_to_hir.cpp b/mesalib/src/glsl/ast_to_hir.cpp
index de3ce902e..49093d88f 100644
--- a/mesalib/src/glsl/ast_to_hir.cpp
+++ b/mesalib/src/glsl/ast_to_hir.cpp
@@ -857,14 +857,11 @@ do_comparison(void *mem_ctx, int operation, ir_rvalue *op0, ir_rvalue *op1)
    case GLSL_TYPE_ERROR:
    case GLSL_TYPE_VOID:
    case GLSL_TYPE_SAMPLER:
+   case GLSL_TYPE_INTERFACE:
       /* I assume a comparison of a struct containing a sampler just
        * ignores the sampler present in the type.
        */
       break;
-
-   default:
-      assert(!"Should not get here.");
-      break;
    }
 
    if (cmp == NULL)
@@ -1625,6 +1622,15 @@ ast_expression::hir(exec_list *instructions,
 	 }
       } else if (array->type->array_size() == 0) {
 	 _mesa_glsl_error(&loc, state, "unsized array index must be constant");
+      } else if (array->type->is_array()
+                 && array->type->fields.array->is_interface()) {
+         /* Page 46 in section 4.3.7 of the OpenGL ES 3.00 spec says:
+          *
+          *     "All indexes used to index a uniform block array must be
+          *     constant integral expressions."
+          */
+         _mesa_glsl_error(&loc, state,
+                          "uniform block array index must be constant");
       } else {
 	 if (array->type->is_array()) {
 	    /* whole_variable_referenced can return NULL if the array is a
@@ -1924,11 +1930,11 @@ is_varying_var(ir_variable *var, _mesa_glsl_parser_targets target)
 {
    switch (target) {
    case vertex_shader:
-      return var->mode == ir_var_out;
+      return var->mode == ir_var_shader_out;
    case fragment_shader:
-      return var->mode == ir_var_in;
+      return var->mode == ir_var_shader_in;
    default:
-      return var->mode == ir_var_out || var->mode == ir_var_in;
+      return var->mode == ir_var_shader_out || var->mode == ir_var_shader_in;
    }
 }
 
@@ -1997,13 +2003,16 @@ apply_type_qualifier_to_variable(const struct ast_type_qualifier *qual,
     * the setting alone.
     */
    if (qual->flags.q.in && qual->flags.q.out)
-      var->mode = ir_var_inout;
-   else if (qual->flags.q.attribute || qual->flags.q.in
+      var->mode = ir_var_function_inout;
+   else if (qual->flags.q.in)
+      var->mode = is_parameter ? ir_var_function_in : ir_var_shader_in;
+   else if (qual->flags.q.attribute
 	    || (qual->flags.q.varying && (state->target == fragment_shader)))
-      var->mode = ir_var_in;
-   else if (qual->flags.q.out
-	    || (qual->flags.q.varying && (state->target == vertex_shader)))
-      var->mode = ir_var_out;
+      var->mode = ir_var_shader_in;
+   else if (qual->flags.q.out)
+      var->mode = is_parameter ? ir_var_function_out : ir_var_shader_out;
+   else if (qual->flags.q.varying && (state->target == vertex_shader))
+      var->mode = ir_var_shader_out;
    else if (qual->flags.q.uniform)
       var->mode = ir_var_uniform;
 
@@ -2028,10 +2037,8 @@ apply_type_qualifier_to_variable(const struct ast_type_qualifier *qual,
        * Similar text exists in the section on vertex shader outputs.
        *
        * Similar text exists in the GLSL ES 3.00 spec, except that the GLSL ES
-       * 3.00 spec claims to allow structs as well.  However, this is likely
-       * an error, since section 11 of the spec ("Counting of Inputs and
-       * Outputs") enumerates all possible types of interstage linkage
-       * variables, and it does not mention structs.
+       * 3.00 spec allows structs as well.  Varying structs are also allowed
+       * in GLSL 1.50.
        */
       switch (var->type->get_scalar_type()->base_type) {
       case GLSL_TYPE_FLOAT:
@@ -2046,6 +2053,8 @@ apply_type_qualifier_to_variable(const struct ast_type_qualifier *qual,
                           state->get_version_string());
          break;
       case GLSL_TYPE_STRUCT:
+         if (state->is_version(150, 300))
+            break;
          _mesa_glsl_error(loc, state,
                           "varying variables may not be of type struct");
          break;
@@ -2058,15 +2067,16 @@ apply_type_qualifier_to_variable(const struct ast_type_qualifier *qual,
    if (state->all_invariant && (state->current_function == NULL)) {
       switch (state->target) {
       case vertex_shader:
-	 if (var->mode == ir_var_out)
+	 if (var->mode == ir_var_shader_out)
 	    var->invariant = true;
 	 break;
       case geometry_shader:
-	 if ((var->mode == ir_var_in) || (var->mode == ir_var_out))
+	 if ((var->mode == ir_var_shader_in)
+             || (var->mode == ir_var_shader_out))
 	    var->invariant = true;
 	 break;
       case fragment_shader:
-	 if (var->mode == ir_var_in)
+	 if (var->mode == ir_var_shader_in)
 	    var->invariant = true;
 	 break;
       }
@@ -2082,8 +2092,8 @@ apply_type_qualifier_to_variable(const struct ast_type_qualifier *qual,
       var->interpolation = INTERP_QUALIFIER_NONE;
 
    if (var->interpolation != INTERP_QUALIFIER_NONE &&
-       !(state->target == vertex_shader && var->mode == ir_var_out) &&
-       !(state->target == fragment_shader && var->mode == ir_var_in)) {
+       !(state->target == vertex_shader && var->mode == ir_var_shader_out) &&
+       !(state->target == fragment_shader && var->mode == ir_var_shader_in)) {
       _mesa_glsl_error(loc, state,
 		       "interpolation qualifier `%s' can only be applied to "
 		       "vertex shader outputs and fragment shader inputs.",
@@ -2116,7 +2126,7 @@ apply_type_qualifier_to_variable(const struct ast_type_qualifier *qual,
        */
       switch (state->target) {
       case vertex_shader:
-	 if (!global_scope || (var->mode != ir_var_in)) {
+	 if (!global_scope || (var->mode != ir_var_shader_in)) {
 	    fail = true;
 	    string = "input";
 	 }
@@ -2129,7 +2139,7 @@ apply_type_qualifier_to_variable(const struct ast_type_qualifier *qual,
 	 break;
 
       case fragment_shader:
-	 if (!global_scope || (var->mode != ir_var_out)) {
+	 if (!global_scope || (var->mode != ir_var_shader_out)) {
 	    fail = true;
 	    string = "output";
 	 }
@@ -2440,7 +2450,7 @@ process_initializer(ir_variable *var, ast_declaration *decl,
 		       "cannot initialize samplers");
    }
 
-   if ((var->mode == ir_var_in) && (state->current_function == NULL)) {
+   if ((var->mode == ir_var_shader_in) && (state->current_function == NULL)) {
       _mesa_glsl_error(& initializer_loc, state,
 		       "cannot initialize %s shader input / %s",
 		       _mesa_glsl_shader_target_name(state->target),
@@ -2579,12 +2589,12 @@ ast_declarator_list::hir(exec_list *instructions,
 			     "Undeclared variable `%s' cannot be marked "
 			     "invariant\n", decl->identifier);
 	 } else if ((state->target == vertex_shader)
-	       && (earlier->mode != ir_var_out)) {
+	       && (earlier->mode != ir_var_shader_out)) {
 	    _mesa_glsl_error(& loc, state,
 			     "`%s' cannot be marked invariant, vertex shader "
 			     "outputs only\n", decl->identifier);
 	 } else if ((state->target == fragment_shader)
-	       && (earlier->mode != ir_var_in)) {
+	       && (earlier->mode != ir_var_shader_in)) {
 	    _mesa_glsl_error(& loc, state,
 			     "`%s' cannot be marked invariant, fragment shader "
 			     "inputs only\n", decl->identifier);
@@ -2707,16 +2717,13 @@ ast_declarator_list::hir(exec_list *instructions,
 				       & loc, this->ubo_qualifiers_valid, false);
 
       if (this->type->qualifier.flags.q.invariant) {
-	 if ((state->target == vertex_shader) && !(var->mode == ir_var_out ||
-						   var->mode == ir_var_inout)) {
-	    /* FINISHME: Note that this doesn't work for invariant on
-	     * a function signature outval
-	     */
+	 if ((state->target == vertex_shader) &&
+             var->mode != ir_var_shader_out) {
 	    _mesa_glsl_error(& loc, state,
 			     "`%s' cannot be marked invariant, vertex shader "
 			     "outputs only\n", var->name);
 	 } else if ((state->target == fragment_shader) &&
-		    !(var->mode == ir_var_in || var->mode == ir_var_inout)) {
+		    var->mode != ir_var_shader_in) {
 	    /* FINISHME: Note that this doesn't work for invariant on
 	     * a function signature inval
 	     */
@@ -2753,7 +2760,7 @@ ast_declarator_list::hir(exec_list *instructions,
 			     "global scope%s",
 			     mode, var->name, extra);
 	 }
-      } else if (var->mode == ir_var_in) {
+      } else if (var->mode == ir_var_shader_in) {
          var->read_only = true;
 
 	 if (state->target == vertex_shader) {
@@ -2833,7 +2840,7 @@ ast_declarator_list::hir(exec_list *instructions,
           && state->target == vertex_shader
           && state->current_function == NULL
           && var->type->is_integer()
-          && var->mode == ir_var_out
+          && var->mode == ir_var_shader_out
           && var->interpolation != INTERP_QUALIFIER_FLAT) {
 
          _mesa_glsl_error(&loc, state, "If a vertex output is an integer, "
@@ -3137,7 +3144,8 @@ ast_parameter_declarator::hir(exec_list *instructions,
    }
 
    is_void = false;
-   ir_variable *var = new(ctx) ir_variable(type, this->identifier, ir_var_in);
+   ir_variable *var = new(ctx)
+      ir_variable(type, this->identifier, ir_var_function_in);
 
    /* Apply any specified qualifiers to the parameter declaration.  Note that
     * for function parameters the default mode is 'in'.
@@ -3151,7 +3159,7 @@ ast_parameter_declarator::hir(exec_list *instructions,
     *    as out or inout function parameters, nor can they be assigned
     *    into."
     */
-   if ((var->mode == ir_var_inout || var->mode == ir_var_out)
+   if ((var->mode == ir_var_function_inout || var->mode == ir_var_function_out)
        && type->contains_sampler()) {
       _mesa_glsl_error(&loc, state, "out and inout parameters cannot contain samplers");
       type = glsl_type::error_type;
@@ -3171,7 +3179,7 @@ ast_parameter_declarator::hir(exec_list *instructions,
     * So for GLSL 1.10, passing an array as an out or inout parameter is not
     * allowed.  This restriction is removed in GLSL 1.20, and in GLSL ES.
     */
-   if ((var->mode == ir_var_inout || var->mode == ir_var_out)
+   if ((var->mode == ir_var_function_inout || var->mode == ir_var_function_out)
        && type->is_array()
        && !state->check_version(120, 100, &loc,
                                 "Arrays cannot be out or inout parameters")) {
@@ -4018,35 +4026,50 @@ ast_type_specifier::hir(exec_list *instructions,
 }
 
 
-ir_rvalue *
-ast_struct_specifier::hir(exec_list *instructions,
-			  struct _mesa_glsl_parse_state *state)
+/**
+ * Process a structure or interface block tree into an array of structure fields
+ *
+ * After parsing, where there are some syntax differences, structures and
+ * interface blocks are almost identical.  They are similar enough that the
+ * AST for each can be processed the same way into a set of
+ * \c glsl_struct_field to describe the members.
+ *
+ * \return
+ * The number of fields processed.  A pointer to the array of structure fields
+ * is stored in \c *fields_ret.
+ */
+unsigned
+ast_process_structure_or_interface_block(exec_list *instructions,
+					 struct _mesa_glsl_parse_state *state,
+					 exec_list *declarations,
+					 YYLTYPE &loc,
+					 glsl_struct_field **fields_ret,
+                                         bool is_interface,
+                                         bool block_row_major)
 {
    unsigned decl_count = 0;
 
-   /* Make an initial pass over the list of structure fields to determine how
+   /* Make an initial pass over the list of fields to determine how
     * many there are.  Each element in this list is an ast_declarator_list.
     * This means that we actually need to count the number of elements in the
     * 'declarations' list in each of the elements.
     */
-   foreach_list_typed (ast_declarator_list, decl_list, link,
-		       &this->declarations) {
+   foreach_list_typed (ast_declarator_list, decl_list, link, declarations) {
       foreach_list_const (decl_ptr, & decl_list->declarations) {
 	 decl_count++;
       }
    }
 
-   /* Allocate storage for the structure fields and process the field
+   /* Allocate storage for the fields and process the field
     * declarations.  As the declarations are processed, try to also convert
     * the types to HIR.  This ensures that structure definitions embedded in
-    * other structure definitions are processed.
+    * other structure definitions or in interface blocks are processed.
     */
    glsl_struct_field *const fields = ralloc_array(state, glsl_struct_field,
 						  decl_count);
 
    unsigned i = 0;
-   foreach_list_typed (ast_declarator_list, decl_list, link,
-		       &this->declarations) {
+   foreach_list_typed (ast_declarator_list, decl_list, link, declarations) {
       const char *type_name;
 
       decl_list->type->specifier->hir(instructions, state);
@@ -4055,7 +4078,6 @@ ast_struct_specifier::hir(exec_list *instructions,
        * embedded structure definitions have been removed from the language.
        */
       if (state->es_shader && decl_list->type->specifier->structure != NULL) {
-	 YYLTYPE loc = this->get_location();
 	 _mesa_glsl_error(&loc, state, "Embedded structure definitions are "
 			  "not allowed in GLSL ES 1.00.");
       }
@@ -4065,25 +4087,88 @@ ast_struct_specifier::hir(exec_list *instructions,
 
       foreach_list_typed (ast_declaration, decl, link,
 			  &decl_list->declarations) {
-	 const struct glsl_type *field_type = decl_type;
+         /* From the GL_ARB_uniform_buffer_object spec:
+          *
+          *     "Sampler types are not allowed inside of uniform
+          *      blocks. All other types, arrays, and structures
+          *      allowed for uniforms are allowed within a uniform
+          *      block."
+          */
+         const struct glsl_type *field_type = decl_type;
+
+         if (is_interface && field_type->contains_sampler()) {
+            YYLTYPE loc = decl_list->get_location();
+            _mesa_glsl_error(&loc, state,
+                             "Uniform in non-default uniform block contains sampler\n");
+         }
+
+         const struct ast_type_qualifier *const qual =
+            & decl_list->type->qualifier;
+         if (qual->flags.q.std140 ||
+             qual->flags.q.packed ||
+             qual->flags.q.shared) {
+            _mesa_glsl_error(&loc, state,
+                             "uniform block layout qualifiers std140, packed, and "
+                             "shared can only be applied to uniform blocks, not "
+                             "members");
+         }
+
 	 if (decl->is_array) {
-	    YYLTYPE loc = decl->get_location();
 	    field_type = process_array_type(&loc, decl_type, decl->array_size,
 					    state);
 	 }
 	 fields[i].type = (field_type != NULL)
 	    ? field_type : glsl_type::error_type;
 	 fields[i].name = decl->identifier;
+
+         if (qual->flags.q.row_major || qual->flags.q.column_major) {
+            if (!field_type->is_matrix() && !field_type->is_record()) {
+               _mesa_glsl_error(&loc, state,
+                                "uniform block layout qualifiers row_major and "
+                                "column_major can only be applied to matrix and "
+                                "structure types");
+            } else
+               validate_matrix_layout_for_type(state, &loc, field_type);
+         }
+
+         if (field_type->is_matrix() ||
+             (field_type->is_array() && field_type->fields.array->is_matrix())) {
+            fields[i].row_major = block_row_major;
+            if (qual->flags.q.row_major)
+               fields[i].row_major = true;
+            else if (qual->flags.q.column_major)
+               fields[i].row_major = false;
+         }
+
 	 i++;
       }
    }
 
    assert(i == decl_count);
 
+   *fields_ret = fields;
+   return decl_count;
+}
+
+
+ir_rvalue *
+ast_struct_specifier::hir(exec_list *instructions,
+			  struct _mesa_glsl_parse_state *state)
+{
+   YYLTYPE loc = this->get_location();
+   glsl_struct_field *fields;
+   unsigned decl_count =
+      ast_process_structure_or_interface_block(instructions,
+					       state,
+					       &this->declarations,
+					       loc,
+					       &fields,
+                                               false,
+                                               false);
+
    const glsl_type *t =
       glsl_type::get_record_instance(fields, decl_count, this->name);
 
-   YYLTYPE loc = this->get_location();
    if (!state->symbols->add_type(name, t)) {
       _mesa_glsl_error(& loc, state, "struct `%s' previously defined", name);
    } else {
@@ -4102,96 +4187,98 @@ ast_struct_specifier::hir(exec_list *instructions,
    return NULL;
 }
 
-static struct gl_uniform_block *
-get_next_uniform_block(struct _mesa_glsl_parse_state *state)
-{
-   if (state->num_uniform_blocks >= state->uniform_block_array_size) {
-      state->uniform_block_array_size *= 2;
-      if (state->uniform_block_array_size <= 4)
-	 state->uniform_block_array_size = 4;
-
-      state->uniform_blocks = reralloc(state,
-				       state->uniform_blocks,
-				       struct gl_uniform_block,
-				       state->uniform_block_array_size);
-   }
-
-   memset(&state->uniform_blocks[state->num_uniform_blocks],
-	  0, sizeof(*state->uniform_blocks));
-   return &state->uniform_blocks[state->num_uniform_blocks++];
-}
-
 ir_rvalue *
 ast_uniform_block::hir(exec_list *instructions,
 		       struct _mesa_glsl_parse_state *state)
 {
+   YYLTYPE loc = this->get_location();
+
    /* The ast_uniform_block has a list of ast_declarator_lists.  We
     * need to turn those into ir_variables with an association
     * with this uniform block.
     */
-   struct gl_uniform_block *ubo = get_next_uniform_block(state);
-   ubo->Name = ralloc_strdup(state->uniform_blocks, this->block_name);
+   enum glsl_interface_packing packing;
+   if (this->layout.flags.q.shared) {
+      packing = GLSL_INTERFACE_PACKING_SHARED;
+   } else if (this->layout.flags.q.packed) {
+      packing = GLSL_INTERFACE_PACKING_PACKED;
+   } else {
+      /* The default layout is std140.
+       */
+      packing = GLSL_INTERFACE_PACKING_STD140;
+   }
 
-   if (!state->symbols->add_uniform_block(ubo)) {
+   bool block_row_major = this->layout.flags.q.row_major;
+   exec_list declared_variables;
+   glsl_struct_field *fields;
+   unsigned int num_variables =
+      ast_process_structure_or_interface_block(&declared_variables,
+                                               state,
+                                               &this->declarations,
+                                               loc,
+                                               &fields,
+                                               true,
+                                               block_row_major);
+
+   const glsl_type *block_type =
+      glsl_type::get_interface_instance(fields,
+                                        num_variables,
+                                        packing,
+                                        this->block_name);
+
+   if (!state->symbols->add_type(block_type->name, block_type)) {
       YYLTYPE loc = this->get_location();
       _mesa_glsl_error(&loc, state, "Uniform block name `%s' already taken in "
-                       "the current scope.\n", ubo->Name);
+                       "the current scope.\n", this->block_name);
    }
 
-   unsigned int num_variables = 0;
-   foreach_list_typed(ast_declarator_list, decl_list, link, &declarations) {
-      foreach_list_const(node, &decl_list->declarations) {
-	 num_variables++;
-      }
-   }
-
-   bool block_row_major = this->layout.flags.q.row_major;
-
-   ubo->Uniforms = rzalloc_array(state->uniform_blocks,
-				 struct gl_uniform_buffer_variable,
-				 num_variables);
-
-   foreach_list_typed(ast_declarator_list, decl_list, link, &declarations) {
-      exec_list declared_variables;
-
-      decl_list->hir(&declared_variables, state);
+   /* Since interface blocks cannot contain statements, it should be
+    * impossible for the block to generate any instructions.
+    */
+   assert(declared_variables.is_empty());
 
-      foreach_list_const(node, &declared_variables) {
-	 ir_variable *var = (ir_variable *)node;
+   /* Page 39 (page 45 of the PDF) of section 4.3.7 in the GLSL ES 3.00 spec
+    * says:
+    *
+    *     "If an instance name (instance-name) is used, then it puts all the
+    *     members inside a scope within its own name space, accessed with the
+    *     field selector ( . ) operator (analogously to structures)."
+    */
+   if (this->instance_name) {
+      ir_variable *var;
 
-	 struct gl_uniform_buffer_variable *ubo_var =
-	    &ubo->Uniforms[ubo->NumUniforms++];
+      if (this->array_size != NULL) {
+         const glsl_type *block_array_type =
+            process_array_type(&loc, block_type, this->array_size, state);
 
-	 var->uniform_block = ubo - state->uniform_blocks;
+         var = new(state) ir_variable(block_array_type,
+                                      this->instance_name,
+                                      ir_var_uniform);
+      } else {
+         var = new(state) ir_variable(block_type,
+                                      this->instance_name,
+                                      ir_var_uniform);
+      }
 
-	 ubo_var->Name = ralloc_strdup(state->uniform_blocks, var->name);
-	 ubo_var->Type = var->type;
-	 ubo_var->Offset = 0; /* Assigned at link time. */
+      var->interface_type = block_type;
+      state->symbols->add_variable(var);
+      instructions->push_tail(var);
+   } else {
+      /* In order to have an array size, the block must also be declared with
+       * an instance name.
+       */
+      assert(this->array_size == NULL);
 
-	 if (var->type->is_matrix() ||
-	     (var->type->is_array() && var->type->fields.array->is_matrix())) {
-	    ubo_var->RowMajor = block_row_major;
-	    if (decl_list->type->qualifier.flags.q.row_major)
-	       ubo_var->RowMajor = true;
-	    else if (decl_list->type->qualifier.flags.q.column_major)
-	       ubo_var->RowMajor = false;
-	 }
+      for (unsigned i = 0; i < num_variables; i++) {
+         ir_variable *var =
+            new(state) ir_variable(fields[i].type,
+                                   ralloc_strdup(state, fields[i].name),
+                                   ir_var_uniform);
+         var->interface_type = block_type;
 
-	 /* From the GL_ARB_uniform_buffer_object spec:
-	  *
-	  *     "Sampler types are not allowed inside of uniform
-	  *      blocks. All other types, arrays, and structures
-	  *      allowed for uniforms are allowed within a uniform
-	  *      block."
-	  */
-	 if (var->type->contains_sampler()) {
-	    YYLTYPE loc = decl_list->get_location();
-	    _mesa_glsl_error(&loc, state,
-			     "Uniform in non-default uniform block contains sampler\n");
-	 }
+         state->symbols->add_variable(var);
+         instructions->push_tail(var);
       }
-
-      instructions->append_list(&declared_variables);
    }
 
    return NULL;
@@ -4222,7 +4309,7 @@ detect_conflicting_assignments(struct _mesa_glsl_parse_state *state,
 	 gl_FragData_assigned = true;
       else if (strncmp(var->name, "gl_", 3) != 0) {
 	 if (state->target == fragment_shader &&
-	     (var->mode == ir_var_out || var->mode == ir_var_inout)) {
+	     var->mode == ir_var_shader_out) {
 	    user_defined_fs_output_assigned = true;
 	    user_defined_fs_output = var;
 	 }
diff --git a/mesalib/src/glsl/builtin_compiler/Makefile.am b/mesalib/src/glsl/builtin_compiler/Makefile.am
index 1a863b228..976640822 100644
--- a/mesalib/src/glsl/builtin_compiler/Makefile.am
+++ b/mesalib/src/glsl/builtin_compiler/Makefile.am
@@ -55,6 +55,7 @@ libglslcore_la_SOURCES =				\
 
 builtin_compiler_SOURCES = \
 	$(top_srcdir)/src/mesa/main/hash_table.c	\
+	$(top_srcdir)/src/mesa/main/imports.c		\
 	$(top_srcdir)/src/mesa/program/prog_hash_table.c\
 	$(top_srcdir)/src/mesa/program/symbol_table.c	\
 	$(BUILTIN_COMPILER_CXX_FILES)			\
diff --git a/mesalib/src/glsl/builtin_types.h b/mesalib/src/glsl/builtin_types.h
index a4c995fd1..c78c2d270 100644
--- a/mesalib/src/glsl/builtin_types.h
+++ b/mesalib/src/glsl/builtin_types.h
@@ -89,9 +89,9 @@ const glsl_type *const glsl_type::mat4_type = & builtin_core_types[14];
 /*@{*/
 
 static const struct glsl_struct_field gl_DepthRangeParameters_fields[] = {
-   { glsl_type::float_type, "near" },
-   { glsl_type::float_type, "far" },
-   { glsl_type::float_type, "diff" },
+   { glsl_type::float_type, "near", false },
+   { glsl_type::float_type, "far", false },
+   { glsl_type::float_type, "diff", false },
 };
 
 const glsl_type glsl_type::builtin_structure_types[] = {
@@ -106,58 +106,58 @@ const glsl_type glsl_type::builtin_structure_types[] = {
 /*@{*/
 
 static const struct glsl_struct_field gl_PointParameters_fields[] = {
-   { glsl_type::float_type, "size" },
-   { glsl_type::float_type, "sizeMin" },
-   { glsl_type::float_type, "sizeMax" },
-   { glsl_type::float_type, "fadeThresholdSize" },
-   { glsl_type::float_type, "distanceConstantAttenuation" },
-   { glsl_type::float_type, "distanceLinearAttenuation" },
-   { glsl_type::float_type, "distanceQuadraticAttenuation" },
+   { glsl_type::float_type, "size", false },
+   { glsl_type::float_type, "sizeMin", false },
+   { glsl_type::float_type, "sizeMax", false },
+   { glsl_type::float_type, "fadeThresholdSize", false },
+   { glsl_type::float_type, "distanceConstantAttenuation", false },
+   { glsl_type::float_type, "distanceLinearAttenuation", false },
+   { glsl_type::float_type, "distanceQuadraticAttenuation", false },
 };
 
 static const struct glsl_struct_field gl_MaterialParameters_fields[] = {
-   { glsl_type::vec4_type, "emission" },
-   { glsl_type::vec4_type, "ambient" },
-   { glsl_type::vec4_type, "diffuse" },
-   { glsl_type::vec4_type, "specular" },
-   { glsl_type::float_type, "shininess" },
+   { glsl_type::vec4_type, "emission", false },
+   { glsl_type::vec4_type, "ambient", false },
+   { glsl_type::vec4_type, "diffuse", false },
+   { glsl_type::vec4_type, "specular", false },
+   { glsl_type::float_type, "shininess", false },
 };
 
 static const struct glsl_struct_field gl_LightSourceParameters_fields[] = {
-   { glsl_type::vec4_type, "ambient" },
-   { glsl_type::vec4_type, "diffuse" },
-   { glsl_type::vec4_type, "specular" },
-   { glsl_type::vec4_type, "position" },
-   { glsl_type::vec4_type, "halfVector" },
-   { glsl_type::vec3_type, "spotDirection" },
-   { glsl_type::float_type, "spotExponent" },
-   { glsl_type::float_type, "spotCutoff" },
-   { glsl_type::float_type, "spotCosCutoff" },
-   { glsl_type::float_type, "constantAttenuation" },
-   { glsl_type::float_type, "linearAttenuation" },
-   { glsl_type::float_type, "quadraticAttenuation" },
+   { glsl_type::vec4_type, "ambient", false },
+   { glsl_type::vec4_type, "diffuse", false },
+   { glsl_type::vec4_type, "specular", false },
+   { glsl_type::vec4_type, "position", false },
+   { glsl_type::vec4_type, "halfVector", false },
+   { glsl_type::vec3_type, "spotDirection", false },
+   { glsl_type::float_type, "spotExponent", false },
+   { glsl_type::float_type, "spotCutoff", false },
+   { glsl_type::float_type, "spotCosCutoff", false },
+   { glsl_type::float_type, "constantAttenuation", false },
+   { glsl_type::float_type, "linearAttenuation", false },
+   { glsl_type::float_type, "quadraticAttenuation", false },
 };
 
 static const struct glsl_struct_field gl_LightModelParameters_fields[] = {
-   { glsl_type::vec4_type, "ambient" },
+   { glsl_type::vec4_type, "ambient", false },
 };
 
 static const struct glsl_struct_field gl_LightModelProducts_fields[] = {
-   { glsl_type::vec4_type, "sceneColor" },
+   { glsl_type::vec4_type, "sceneColor", false },
 };
 
 static const struct glsl_struct_field gl_LightProducts_fields[] = {
-   { glsl_type::vec4_type, "ambient" },
-   { glsl_type::vec4_type, "diffuse" },
-   { glsl_type::vec4_type, "specular" },
+   { glsl_type::vec4_type, "ambient", false },
+   { glsl_type::vec4_type, "diffuse", false },
+   { glsl_type::vec4_type, "specular", false },
 };
 
 static const struct glsl_struct_field gl_FogParameters_fields[] = {
-   { glsl_type::vec4_type, "color" },
-   { glsl_type::float_type, "density" },
-   { glsl_type::float_type, "start" },
-   { glsl_type::float_type, "end" },
-   { glsl_type::float_type, "scale" },
+   { glsl_type::vec4_type, "color", false },
+   { glsl_type::float_type, "density", false },
+   { glsl_type::float_type, "start", false },
+   { glsl_type::float_type, "end", false },
+   { glsl_type::float_type, "scale", false },
 };
 
 const glsl_type glsl_type::builtin_110_deprecated_structure_types[] = {
diff --git a/mesalib/src/glsl/builtin_variables.cpp b/mesalib/src/glsl/builtin_variables.cpp
index e7769419f..ccee7746e 100644
--- a/mesalib/src/glsl/builtin_variables.cpp
+++ b/mesalib/src/glsl/builtin_variables.cpp
@@ -47,18 +47,18 @@ struct builtin_variable {
 };
 
 static const builtin_variable builtin_core_vs_variables[] = {
-   { ir_var_out, VERT_RESULT_HPOS, "vec4",  "gl_Position" },
-   { ir_var_out, VERT_RESULT_PSIZ, "float", "gl_PointSize" },
+   { ir_var_shader_out, VERT_RESULT_HPOS, "vec4",  "gl_Position" },
+   { ir_var_shader_out, VERT_RESULT_PSIZ, "float", "gl_PointSize" },
 };
 
 static const builtin_variable builtin_core_fs_variables[] = {
-   { ir_var_in,  FRAG_ATTRIB_WPOS,  "vec4",  "gl_FragCoord" },
-   { ir_var_in,  FRAG_ATTRIB_FACE,  "bool",  "gl_FrontFacing" },
-   { ir_var_out, FRAG_RESULT_COLOR, "vec4",  "gl_FragColor" },
+   { ir_var_shader_in,  FRAG_ATTRIB_WPOS,  "vec4",  "gl_FragCoord" },
+   { ir_var_shader_in,  FRAG_ATTRIB_FACE,  "bool",  "gl_FrontFacing" },
+   { ir_var_shader_out, FRAG_RESULT_COLOR, "vec4",  "gl_FragColor" },
 };
 
 static const builtin_variable builtin_100ES_fs_variables[] = {
-   { ir_var_in,  FRAG_ATTRIB_PNTC,   "vec2",   "gl_PointCoord" },
+   { ir_var_shader_in,  FRAG_ATTRIB_PNTC,   "vec2",   "gl_PointCoord" },
 };
 
 static const builtin_variable builtin_300ES_vs_variables[] = {
@@ -66,46 +66,46 @@ static const builtin_variable builtin_300ES_vs_variables[] = {
 };
 
 static const builtin_variable builtin_300ES_fs_variables[] = {
-   { ir_var_in,  FRAG_ATTRIB_WPOS,  "vec4",  "gl_FragCoord" },
-   { ir_var_in,  FRAG_ATTRIB_FACE,  "bool",  "gl_FrontFacing" },
-   { ir_var_out, FRAG_RESULT_DEPTH, "float", "gl_FragDepth" },
-   { ir_var_in,  FRAG_ATTRIB_PNTC,   "vec2",   "gl_PointCoord" },
+   { ir_var_shader_in,  FRAG_ATTRIB_WPOS,  "vec4",  "gl_FragCoord" },
+   { ir_var_shader_in,  FRAG_ATTRIB_FACE,  "bool",  "gl_FrontFacing" },
+   { ir_var_shader_out, FRAG_RESULT_DEPTH, "float", "gl_FragDepth" },
+   { ir_var_shader_in,  FRAG_ATTRIB_PNTC,   "vec2",   "gl_PointCoord" },
 };
 
 static const builtin_variable builtin_110_fs_variables[] = {
-   { ir_var_out, FRAG_RESULT_DEPTH, "float", "gl_FragDepth" },
+   { ir_var_shader_out, FRAG_RESULT_DEPTH, "float", "gl_FragDepth" },
 };
 
 static const builtin_variable builtin_110_deprecated_fs_variables[] = {
-   { ir_var_in,  FRAG_ATTRIB_COL0,  "vec4",  "gl_Color" },
-   { ir_var_in,  FRAG_ATTRIB_COL1,  "vec4",  "gl_SecondaryColor" },
-   { ir_var_in,  FRAG_ATTRIB_FOGC,  "float", "gl_FogFragCoord" },
+   { ir_var_shader_in,  FRAG_ATTRIB_COL0,  "vec4",  "gl_Color" },
+   { ir_var_shader_in,  FRAG_ATTRIB_COL1,  "vec4",  "gl_SecondaryColor" },
+   { ir_var_shader_in,  FRAG_ATTRIB_FOGC,  "float", "gl_FogFragCoord" },
 };
 
 static const builtin_variable builtin_110_deprecated_vs_variables[] = {
-   { ir_var_in,  VERT_ATTRIB_POS,         "vec4",  "gl_Vertex" },
-   { ir_var_in,  VERT_ATTRIB_NORMAL,      "vec3",  "gl_Normal" },
-   { ir_var_in,  VERT_ATTRIB_COLOR0,      "vec4",  "gl_Color" },
-   { ir_var_in,  VERT_ATTRIB_COLOR1,      "vec4",  "gl_SecondaryColor" },
-   { ir_var_in,  VERT_ATTRIB_TEX0,        "vec4",  "gl_MultiTexCoord0" },
-   { ir_var_in,  VERT_ATTRIB_TEX1,        "vec4",  "gl_MultiTexCoord1" },
-   { ir_var_in,  VERT_ATTRIB_TEX2,        "vec4",  "gl_MultiTexCoord2" },
-   { ir_var_in,  VERT_ATTRIB_TEX3,        "vec4",  "gl_MultiTexCoord3" },
-   { ir_var_in,  VERT_ATTRIB_TEX4,        "vec4",  "gl_MultiTexCoord4" },
-   { ir_var_in,  VERT_ATTRIB_TEX5,        "vec4",  "gl_MultiTexCoord5" },
-   { ir_var_in,  VERT_ATTRIB_TEX6,        "vec4",  "gl_MultiTexCoord6" },
-   { ir_var_in,  VERT_ATTRIB_TEX7,        "vec4",  "gl_MultiTexCoord7" },
-   { ir_var_in,  VERT_ATTRIB_FOG,         "float", "gl_FogCoord" },
-   { ir_var_out, VERT_RESULT_CLIP_VERTEX, "vec4",  "gl_ClipVertex" },
-   { ir_var_out, VERT_RESULT_COL0,        "vec4",  "gl_FrontColor" },
-   { ir_var_out, VERT_RESULT_BFC0,        "vec4",  "gl_BackColor" },
-   { ir_var_out, VERT_RESULT_COL1,        "vec4",  "gl_FrontSecondaryColor" },
-   { ir_var_out, VERT_RESULT_BFC1,        "vec4",  "gl_BackSecondaryColor" },
-   { ir_var_out, VERT_RESULT_FOGC,        "float", "gl_FogFragCoord" },
+   { ir_var_shader_in,  VERT_ATTRIB_POS,         "vec4",  "gl_Vertex" },
+   { ir_var_shader_in,  VERT_ATTRIB_NORMAL,      "vec3",  "gl_Normal" },
+   { ir_var_shader_in,  VERT_ATTRIB_COLOR0,      "vec4",  "gl_Color" },
+   { ir_var_shader_in,  VERT_ATTRIB_COLOR1,      "vec4",  "gl_SecondaryColor" },
+   { ir_var_shader_in,  VERT_ATTRIB_TEX0,        "vec4",  "gl_MultiTexCoord0" },
+   { ir_var_shader_in,  VERT_ATTRIB_TEX1,        "vec4",  "gl_MultiTexCoord1" },
+   { ir_var_shader_in,  VERT_ATTRIB_TEX2,        "vec4",  "gl_MultiTexCoord2" },
+   { ir_var_shader_in,  VERT_ATTRIB_TEX3,        "vec4",  "gl_MultiTexCoord3" },
+   { ir_var_shader_in,  VERT_ATTRIB_TEX4,        "vec4",  "gl_MultiTexCoord4" },
+   { ir_var_shader_in,  VERT_ATTRIB_TEX5,        "vec4",  "gl_MultiTexCoord5" },
+   { ir_var_shader_in,  VERT_ATTRIB_TEX6,        "vec4",  "gl_MultiTexCoord6" },
+   { ir_var_shader_in,  VERT_ATTRIB_TEX7,        "vec4",  "gl_MultiTexCoord7" },
+   { ir_var_shader_in,  VERT_ATTRIB_FOG,         "float", "gl_FogCoord" },
+   { ir_var_shader_out, VERT_RESULT_CLIP_VERTEX, "vec4",  "gl_ClipVertex" },
+   { ir_var_shader_out, VERT_RESULT_COL0,        "vec4",  "gl_FrontColor" },
+   { ir_var_shader_out, VERT_RESULT_BFC0,        "vec4",  "gl_BackColor" },
+   { ir_var_shader_out, VERT_RESULT_COL1,        "vec4",  "gl_FrontSecondaryColor" },
+   { ir_var_shader_out, VERT_RESULT_BFC1,        "vec4",  "gl_BackSecondaryColor" },
+   { ir_var_shader_out, VERT_RESULT_FOGC,        "float", "gl_FogFragCoord" },
 };
 
 static const builtin_variable builtin_120_fs_variables[] = {
-   { ir_var_in,  FRAG_ATTRIB_PNTC,   "vec2",   "gl_PointCoord" },
+   { ir_var_shader_in,  FRAG_ATTRIB_PNTC,   "vec2",   "gl_PointCoord" },
 };
 
 static const builtin_variable builtin_130_vs_variables[] = {
@@ -403,16 +403,18 @@ add_variable(exec_list *instructions, glsl_symbol_table *symtab,
 
    switch (var->mode) {
    case ir_var_auto:
-   case ir_var_in:
-   case ir_var_const_in:
+   case ir_var_shader_in:
    case ir_var_uniform:
    case ir_var_system_value:
       var->read_only = true;
       break;
-   case ir_var_inout:
-   case ir_var_out:
+   case ir_var_shader_out:
       break;
    default:
+      /* The only variables that are added using this function should be
+       * uniforms, shader inputs, shader outputs, constants (which use
+       * ir_var_auto), and system values.
+       */
       assert(0);
       break;
    }
@@ -752,7 +754,8 @@ generate_110_vs_variables(exec_list *instructions,
       glsl_type::get_array_instance(glsl_type::vec4_type, 0);
 
    add_variable(instructions, state->symbols,
-		"gl_TexCoord", vec4_array_type, ir_var_out, VERT_RESULT_TEX0);
+		"gl_TexCoord", vec4_array_type, ir_var_shader_out,
+                VERT_RESULT_TEX0);
 
    generate_ARB_draw_buffers_variables(instructions, state, false,
 				       vertex_shader);
@@ -812,7 +815,7 @@ generate_130_vs_variables(exec_list *instructions,
       glsl_type::get_array_instance(glsl_type::float_type, 0);
 
    add_variable(instructions, state->symbols,
-		"gl_ClipDistance", clip_distance_array_type, ir_var_out,
+		"gl_ClipDistance", clip_distance_array_type, ir_var_shader_out,
                 VERT_RESULT_CLIP_DIST0);
 
 }
@@ -937,7 +940,8 @@ generate_110_fs_variables(exec_list *instructions,
       glsl_type::get_array_instance(glsl_type::vec4_type, 0);
 
    add_variable(instructions, state->symbols,
-		"gl_TexCoord", vec4_array_type, ir_var_in, FRAG_ATTRIB_TEX0);
+		"gl_TexCoord", vec4_array_type, ir_var_shader_in,
+                FRAG_ATTRIB_TEX0);
 
    generate_ARB_draw_buffers_variables(instructions, state, false,
 				       fragment_shader);
@@ -969,7 +973,7 @@ generate_ARB_draw_buffers_variables(exec_list *instructions,
       ir_variable *const fd =
 	 add_variable(instructions, state->symbols,
 		      "gl_FragData", vec4_array_type,
-		      ir_var_out, FRAG_RESULT_DATA0);
+		      ir_var_shader_out, FRAG_RESULT_DATA0);
 
       if (warn)
 	 fd->warn_extension = "GL_ARB_draw_buffers";
@@ -1026,7 +1030,7 @@ generate_ARB_shader_stencil_export_variables(exec_list *instructions,
    ir_variable *const fd =
       add_variable(instructions, state->symbols,
 		   "gl_FragStencilRefARB", glsl_type::int_type,
-		   ir_var_out, FRAG_RESULT_STENCIL);
+		   ir_var_shader_out, FRAG_RESULT_STENCIL);
 
    if (warn)
       fd->warn_extension = "GL_ARB_shader_stencil_export";
@@ -1042,7 +1046,7 @@ generate_AMD_shader_stencil_export_variables(exec_list *instructions,
    ir_variable *const fd =
       add_variable(instructions, state->symbols,
 		   "gl_FragStencilRefAMD", glsl_type::int_type,
-		   ir_var_out, FRAG_RESULT_STENCIL);
+		   ir_var_shader_out, FRAG_RESULT_STENCIL);
 
    if (warn)
       fd->warn_extension = "GL_AMD_shader_stencil_export";
@@ -1083,7 +1087,7 @@ generate_fs_clipdistance(exec_list *instructions,
       glsl_type::get_array_instance(glsl_type::float_type, 0);
 
    add_variable(instructions, state->symbols,
-		"gl_ClipDistance", clip_distance_array_type, ir_var_in,
+		"gl_ClipDistance", clip_distance_array_type, ir_var_shader_in,
                 FRAG_ATTRIB_CLIP_DIST0);
 }
 
diff --git a/mesalib/src/glsl/glcpp/glcpp-parse.y b/mesalib/src/glsl/glcpp/glcpp-parse.y
index 8fba923a2..e927c7cb7 100644
--- a/mesalib/src/glsl/glcpp/glcpp-parse.y
+++ b/mesalib/src/glsl/glcpp/glcpp-parse.y
@@ -1227,6 +1227,9 @@ glcpp_parser_create (const struct gl_extensions *extensions, int api)
 
 	      if (extensions->ARB_texture_cube_map_array)
 	         add_builtin_define(parser, "GL_ARB_texture_cube_map_array", 1);
+
+	      if (extensions->ARB_shading_language_packing)
+	         add_builtin_define(parser, "GL_ARB_shading_language_packing", 1);
 	   }
 	}
 
diff --git a/mesalib/src/glsl/glsl_lexer.ll b/mesalib/src/glsl/glsl_lexer.ll
index 2f66c5828..ddc9f8073 100644
--- a/mesalib/src/glsl/glsl_lexer.ll
+++ b/mesalib/src/glsl/glsl_lexer.ll
@@ -399,23 +399,23 @@ layout		{
 			}
 
 [0-9]+\.[0-9]+([eE][+-]?[0-9]+)?[fF]?	{
-			    yylval->real = glsl_strtod(yytext, NULL);
+			    yylval->real = glsl_strtof(yytext, NULL);
 			    return FLOATCONSTANT;
 			}
 \.[0-9]+([eE][+-]?[0-9]+)?[fF]?		{
-			    yylval->real = glsl_strtod(yytext, NULL);
+			    yylval->real = glsl_strtof(yytext, NULL);
 			    return FLOATCONSTANT;
 			}
 [0-9]+\.([eE][+-]?[0-9]+)?[fF]?		{
-			    yylval->real = glsl_strtod(yytext, NULL);
+			    yylval->real = glsl_strtof(yytext, NULL);
 			    return FLOATCONSTANT;
 			}
 [0-9]+[eE][+-]?[0-9]+[fF]?		{
-			    yylval->real = glsl_strtod(yytext, NULL);
+			    yylval->real = glsl_strtof(yytext, NULL);
 			    return FLOATCONSTANT;
 			}
 [0-9]+[fF]		{
-			    yylval->real = glsl_strtod(yytext, NULL);
+			    yylval->real = glsl_strtof(yytext, NULL);
 			    return FLOATCONSTANT;
 			}
 
diff --git a/mesalib/src/glsl/glsl_parser.yy b/mesalib/src/glsl/glsl_parser.yy
index 88aae64d4..154ce2d09 100644
--- a/mesalib/src/glsl/glsl_parser.yy
+++ b/mesalib/src/glsl/glsl_parser.yy
@@ -79,6 +79,7 @@ static void yyerror(YYLTYPE *loc, _mesa_glsl_parse_state *st, const char *msg)
    ast_case_label_list *case_label_list;
    ast_case_statement *case_statement;
    ast_case_statement_list *case_statement_list;
+   ast_uniform_block *uniform_block;
 
    struct {
       ast_node *cond;
@@ -112,6 +113,7 @@ static void yyerror(YYLTYPE *loc, _mesa_glsl_parse_state *st, const char *msg)
 %token STRUCT VOID_TOK WHILE
 %token <identifier> IDENTIFIER TYPE_IDENTIFIER NEW_IDENTIFIER
 %type <identifier> any_identifier
+%type <uniform_block> instance_name_opt
 %token <real> FLOATCONSTANT
 %token <n> INTCONSTANT UINTCONSTANT BOOLCONSTANT
 %token <identifier> FIELD_SELECTION
@@ -221,6 +223,7 @@ static void yyerror(YYLTYPE *loc, _mesa_glsl_parse_state *st, const char *msg)
 %type <node> declaration_statement
 %type <node> jump_statement
 %type <node> uniform_block
+%type <uniform_block> basic_uniform_block
 %type <struct_specifier> struct_specifier
 %type <declarator_list> struct_declaration_list
 %type <declarator_list> struct_declaration
@@ -1884,31 +1887,27 @@ function_definition:
 
 /* layout_qualifieropt is packed into this rule */
 uniform_block:
-	UNIFORM NEW_IDENTIFIER '{' member_list '}' ';'
+	basic_uniform_block
 	{
-	   void *ctx = state;
-	   $$ = new(ctx) ast_uniform_block(*state->default_uniform_qualifier,
-					   $2, $4);
-
-	   if (!state->ARB_uniform_buffer_object_enable) {
-	      _mesa_glsl_error(& @1, state,
-			       "#version 140 / GL_ARB_uniform_buffer_object "
-			       "required for defining uniform blocks\n");
-	   } else if (state->ARB_uniform_buffer_object_warn) {
-	      _mesa_glsl_warning(& @1, state,
-				 "#version 140 / GL_ARB_uniform_buffer_object "
-				 "required for defining uniform blocks\n");
-	   }
+	   $$ = $1;
 	}
-	| layout_qualifier UNIFORM NEW_IDENTIFIER '{' member_list '}' ';'
+	| layout_qualifier basic_uniform_block
 	{
-	   void *ctx = state;
-
-	   ast_type_qualifier qual = *state->default_uniform_qualifier;
-	   if (!qual.merge_qualifier(& @1, state, $1)) {
+	   ast_uniform_block *block = $2;
+	   if (!block->layout.merge_qualifier(& @1, state, $1)) {
 	      YYERROR;
 	   }
-	   $$ = new(ctx) ast_uniform_block(qual, $3, $5);
+	   $$ = block;
+	}
+	;
+
+basic_uniform_block:
+	UNIFORM NEW_IDENTIFIER '{' member_list '}' instance_name_opt ';'
+	{
+	   ast_uniform_block *const block = $6;
+
+	   block->block_name = $2;
+	   block->declarations.push_degenerate_list_at_head(& $4->link);
 
 	   if (!state->ARB_uniform_buffer_object_enable) {
 	      _mesa_glsl_error(& @1, state,
@@ -1919,6 +1918,49 @@ uniform_block:
 				 "#version 140 / GL_ARB_uniform_buffer_object "
 				 "required for defining uniform blocks\n");
 	   }
+
+	   /* Since block arrays require names, and both features are added in
+	    * the same language versions, we don't have to explicitly
+	    * version-check both things.
+	    */
+	   if (block->instance_name != NULL
+	       && !(state->language_version == 300 && state->es_shader)) {
+	      _mesa_glsl_error(& @1, state,
+			       "#version 300 es required for using uniform "
+			       "blocks with an instance name\n");
+	   }
+
+	   $$ = block;
+	}
+	;
+
+instance_name_opt:
+	/* empty */
+	{
+	   $$ = new(state) ast_uniform_block(*state->default_uniform_qualifier,
+					     NULL,
+					     NULL);
+	}
+	| NEW_IDENTIFIER
+	{
+	   $$ = new(state) ast_uniform_block(*state->default_uniform_qualifier,
+					     $1,
+					     NULL);
+	}
+	| NEW_IDENTIFIER '[' constant_expression ']'
+	{
+	   $$ = new(state) ast_uniform_block(*state->default_uniform_qualifier,
+					     $1,
+					     $3);
+	}
+	| NEW_IDENTIFIER '[' ']'
+	{
+	   _mesa_glsl_error(& @1, state,
+			    "instance block arrays must be explicitly sized\n");
+
+	   $$ = new(state) ast_uniform_block(*state->default_uniform_qualifier,
+					     $1,
+					     NULL);
 	}
 	;
 
diff --git a/mesalib/src/glsl/glsl_parser_extras.cpp b/mesalib/src/glsl/glsl_parser_extras.cpp
index b460c8619..c8dbc89ff 100644
--- a/mesalib/src/glsl/glsl_parser_extras.cpp
+++ b/mesalib/src/glsl/glsl_parser_extras.cpp
@@ -462,6 +462,7 @@ static const _mesa_glsl_extension _mesa_glsl_supported_extensions[] = {
    EXT(ARB_uniform_buffer_object,      true,  false, true,  true,  false,     ARB_uniform_buffer_object),
    EXT(OES_standard_derivatives,       false, false, true,  false,  true,     OES_standard_derivatives),
    EXT(ARB_texture_cube_map_array,     true,  false, true,  true,  false,     ARB_texture_cube_map_array),
+   EXT(ARB_shading_language_packing,   true,  false, true,  true,  false,     ARB_shading_language_packing),
 };
 
 #undef EXT
diff --git a/mesalib/src/glsl/glsl_parser_extras.h b/mesalib/src/glsl/glsl_parser_extras.h
index 2e6bb0b0a..53df149d8 100644
--- a/mesalib/src/glsl/glsl_parser_extras.h
+++ b/mesalib/src/glsl/glsl_parser_extras.h
@@ -272,6 +272,8 @@ struct _mesa_glsl_parse_state {
    bool OES_standard_derivatives_warn;
    bool ARB_texture_cube_map_array_enable;
    bool ARB_texture_cube_map_array_warn;
+   bool ARB_shading_language_packing_enable;
+   bool ARB_shading_language_packing_warn;
    /*@}*/
 
    /** Extensions supported by the OpenGL implementation. */
diff --git a/mesalib/src/glsl/glsl_symbol_table.cpp b/mesalib/src/glsl/glsl_symbol_table.cpp
index eb275b12e..8d34547c6 100644
--- a/mesalib/src/glsl/glsl_symbol_table.cpp
+++ b/mesalib/src/glsl/glsl_symbol_table.cpp
@@ -41,15 +41,13 @@ public:
       ralloc_free(entry);
    }
 
-   symbol_table_entry(ir_variable *v)               : v(v), f(0), t(0), u(0) {}
-   symbol_table_entry(ir_function *f)               : v(0), f(f), t(0), u(0) {}
-   symbol_table_entry(const glsl_type *t)           : v(0), f(0), t(t), u(0) {}
-   symbol_table_entry(struct gl_uniform_block *u)   : v(0), f(0), t(0), u(u) {}
+   symbol_table_entry(ir_variable *v)               : v(v), f(0), t(0) {}
+   symbol_table_entry(ir_function *f)               : v(0), f(f), t(0) {}
+   symbol_table_entry(const glsl_type *t)           : v(0), f(0), t(t) {}
 
    ir_variable *v;
    ir_function *f;
    const glsl_type *t;
-   struct gl_uniform_block *u;
 };
 
 glsl_symbol_table::glsl_symbol_table()
@@ -134,12 +132,6 @@ bool glsl_symbol_table::add_function(ir_function *f)
    return _mesa_symbol_table_add_symbol(table, -1, f->name, entry) == 0;
 }
 
-bool glsl_symbol_table::add_uniform_block(struct gl_uniform_block *u)
-{
-   symbol_table_entry *entry = new(mem_ctx) symbol_table_entry(u);
-   return _mesa_symbol_table_add_symbol(table, -1, u->Name, entry) == 0;
-}
-
 void glsl_symbol_table::add_global_function(ir_function *f)
 {
    symbol_table_entry *entry = new(mem_ctx) symbol_table_entry(f);
diff --git a/mesalib/src/glsl/glsl_symbol_table.h b/mesalib/src/glsl/glsl_symbol_table.h
index f95fb8a01..9f5602787 100644
--- a/mesalib/src/glsl/glsl_symbol_table.h
+++ b/mesalib/src/glsl/glsl_symbol_table.h
@@ -99,7 +99,6 @@ public:
    bool add_variable(ir_variable *v);
    bool add_type(const char *name, const glsl_type *t);
    bool add_function(ir_function *f);
-   bool add_uniform_block(struct gl_uniform_block *u);
    /*@}*/
 
    /**
diff --git a/mesalib/src/glsl/glsl_types.cpp b/mesalib/src/glsl/glsl_types.cpp
index 71b185027..4a2c87907 100644
--- a/mesalib/src/glsl/glsl_types.cpp
+++ b/mesalib/src/glsl/glsl_types.cpp
@@ -34,6 +34,7 @@ extern "C" {
 
 hash_table *glsl_type::array_types = NULL;
 hash_table *glsl_type::record_types = NULL;
+hash_table *glsl_type::interface_types = NULL;
 void *glsl_type::mem_ctx = NULL;
 
 void
@@ -51,7 +52,7 @@ glsl_type::glsl_type(GLenum gl_type,
    gl_type(gl_type),
    base_type(base_type),
    sampler_dimensionality(0), sampler_shadow(0), sampler_array(0),
-   sampler_type(0),
+   sampler_type(0), interface_packing(0),
    vector_elements(vector_elements), matrix_columns(matrix_columns),
    length(0)
 {
@@ -69,7 +70,7 @@ glsl_type::glsl_type(GLenum gl_type,
    gl_type(gl_type),
    base_type(GLSL_TYPE_SAMPLER),
    sampler_dimensionality(dim), sampler_shadow(shadow),
-   sampler_array(array), sampler_type(type),
+   sampler_array(array), sampler_type(type), interface_packing(0),
    vector_elements(0), matrix_columns(0),
    length(0)
 {
@@ -82,7 +83,7 @@ glsl_type::glsl_type(const glsl_struct_field *fields, unsigned num_fields,
 		     const char *name) :
    base_type(GLSL_TYPE_STRUCT),
    sampler_dimensionality(0), sampler_shadow(0), sampler_array(0),
-   sampler_type(0),
+   sampler_type(0), interface_packing(0),
    vector_elements(0), matrix_columns(0),
    length(num_fields)
 {
@@ -96,6 +97,29 @@ glsl_type::glsl_type(const glsl_struct_field *fields, unsigned num_fields,
       this->fields.structure[i].type = fields[i].type;
       this->fields.structure[i].name = ralloc_strdup(this->fields.structure,
 						     fields[i].name);
+      this->fields.structure[i].row_major = fields[i].row_major;
+   }
+}
+
+glsl_type::glsl_type(const glsl_struct_field *fields, unsigned num_fields,
+		     enum glsl_interface_packing packing, const char *name) :
+   base_type(GLSL_TYPE_INTERFACE),
+   sampler_dimensionality(0), sampler_shadow(0), sampler_array(0),
+   sampler_type(0), interface_packing((unsigned) packing),
+   vector_elements(0), matrix_columns(0),
+   length(num_fields)
+{
+   unsigned int i;
+
+   init_ralloc_type_ctx();
+   this->name = ralloc_strdup(this->mem_ctx, name);
+   this->fields.structure = ralloc_array(this->mem_ctx,
+					 glsl_struct_field, length);
+   for (i = 0; i < length; i++) {
+      this->fields.structure[i].type = fields[i].type;
+      this->fields.structure[i].name = ralloc_strdup(this->fields.structure,
+						     fields[i].name);
+      this->fields.structure[i].row_major = fields[i].row_major;
    }
 }
 
@@ -429,7 +453,7 @@ _mesa_glsl_release_types(void)
 glsl_type::glsl_type(const glsl_type *array, unsigned length) :
    base_type(GLSL_TYPE_ARRAY),
    sampler_dimensionality(0), sampler_shadow(0), sampler_array(0),
-   sampler_type(0),
+   sampler_type(0), interface_packing(0),
    vector_elements(0), matrix_columns(0),
    name(NULL), length(length)
 {
@@ -561,12 +585,18 @@ glsl_type::record_key_compare(const void *a, const void *b)
    if (key1->length != key2->length)
       return 1;
 
+   if (key1->interface_packing != key2->interface_packing)
+      return 1;
+
    for (unsigned i = 0; i < key1->length; i++) {
       if (key1->fields.structure[i].type != key2->fields.structure[i].type)
 	 return 1;
       if (strcmp(key1->fields.structure[i].name,
 		 key2->fields.structure[i].name) != 0)
 	 return 1;
+      if (key1->fields.structure[i].row_major
+         != key2->fields.structure[i].row_major)
+        return 1;
    }
 
    return 0;
@@ -621,9 +651,37 @@ glsl_type::get_record_instance(const glsl_struct_field *fields,
 
 
 const glsl_type *
+glsl_type::get_interface_instance(const glsl_struct_field *fields,
+				  unsigned num_fields,
+				  enum glsl_interface_packing packing,
+				  const char *name)
+{
+   const glsl_type key(fields, num_fields, packing, name);
+
+   if (interface_types == NULL) {
+      interface_types = hash_table_ctor(64, record_key_hash, record_key_compare);
+   }
+
+   const glsl_type *t = (glsl_type *) hash_table_find(interface_types, & key);
+   if (t == NULL) {
+      t = new glsl_type(fields, num_fields, packing, name);
+
+      hash_table_insert(interface_types, (void *) t, t);
+   }
+
+   assert(t->base_type == GLSL_TYPE_INTERFACE);
+   assert(t->length == num_fields);
+   assert(strcmp(t->name, name) == 0);
+
+   return t;
+}
+
+
+const glsl_type *
 glsl_type::field_type(const char *name) const
 {
-   if (this->base_type != GLSL_TYPE_STRUCT)
+   if (this->base_type != GLSL_TYPE_STRUCT
+       && this->base_type != GLSL_TYPE_INTERFACE)
       return error_type;
 
    for (unsigned i = 0; i < this->length; i++) {
@@ -638,7 +696,8 @@ glsl_type::field_type(const char *name) const
 int
 glsl_type::field_index(const char *name) const
 {
-   if (this->base_type != GLSL_TYPE_STRUCT)
+   if (this->base_type != GLSL_TYPE_STRUCT
+       && this->base_type != GLSL_TYPE_INTERFACE)
       return -1;
 
    for (unsigned i = 0; i < this->length; i++) {
@@ -660,7 +719,8 @@ glsl_type::component_slots() const
    case GLSL_TYPE_BOOL:
       return this->components();
 
-   case GLSL_TYPE_STRUCT: {
+   case GLSL_TYPE_STRUCT:
+   case GLSL_TYPE_INTERFACE: {
       unsigned size = 0;
 
       for (unsigned i = 0; i < this->length; i++)
@@ -672,9 +732,13 @@ glsl_type::component_slots() const
    case GLSL_TYPE_ARRAY:
       return this->length * this->fields.array->component_slots();
 
-   default:
-      return 0;
+   case GLSL_TYPE_SAMPLER:
+   case GLSL_TYPE_VOID:
+   case GLSL_TYPE_ERROR:
+      break;
    }
+
+   return 0;
 }
 
 bool
@@ -799,12 +863,6 @@ glsl_type::std140_base_alignment(bool row_major) const
    return -1;
 }
 
-static unsigned
-align(unsigned val, unsigned align)
-{
-   return (val + align - 1) / align * align;
-}
-
 unsigned
 glsl_type::std140_size(bool row_major) const
 {
@@ -906,11 +964,11 @@ glsl_type::std140_size(bool row_major) const
       for (unsigned i = 0; i < this->length; i++) {
 	 const struct glsl_type *field_type = this->fields.structure[i].type;
 	 unsigned align = field_type->std140_base_alignment(row_major);
-	 size = (size + align - 1) / align * align;
+	 size = glsl_align(size, align);
 	 size += field_type->std140_size(row_major);
       }
-      size = align(size,
-		   this->fields.structure[0].type->std140_base_alignment(row_major));
+      size = glsl_align(size,
+			this->fields.structure[0].type->std140_base_alignment(row_major));
       return size;
    }
 
diff --git a/mesalib/src/glsl/glsl_types.h b/mesalib/src/glsl/glsl_types.h
index d6f5c105e..b0db2bf11 100644
--- a/mesalib/src/glsl/glsl_types.h
+++ b/mesalib/src/glsl/glsl_types.h
@@ -54,6 +54,7 @@ enum glsl_base_type {
    GLSL_TYPE_BOOL,
    GLSL_TYPE_SAMPLER,
    GLSL_TYPE_STRUCT,
+   GLSL_TYPE_INTERFACE,
    GLSL_TYPE_ARRAY,
    GLSL_TYPE_VOID,
    GLSL_TYPE_ERROR
@@ -69,6 +70,12 @@ enum glsl_sampler_dim {
    GLSL_SAMPLER_DIM_EXTERNAL
 };
 
+enum glsl_interface_packing {
+   GLSL_INTERFACE_PACKING_STD140,
+   GLSL_INTERFACE_PACKING_SHARED,
+   GLSL_INTERFACE_PACKING_PACKED
+};
+
 #ifdef __cplusplus
 #include "GL/gl.h"
 #include "ralloc.h"
@@ -84,6 +91,7 @@ struct glsl_type {
 				* only \c GLSL_TYPE_FLOAT, \c GLSL_TYPE_INT,
 				* and \c GLSL_TYPE_UINT are valid.
 				*/
+   unsigned interface_packing:2;
 
    /* Callers of this ralloc-based new need not call delete. It's
     * easier to just ralloc_free 'mem_ctx' (or any of its ancestors). */
@@ -130,8 +138,9 @@ struct glsl_type {
 
    /**
     * For \c GLSL_TYPE_ARRAY, this is the length of the array.  For
-    * \c GLSL_TYPE_STRUCT, it is the number of elements in the structure and
-    * the number of values pointed to by \c fields.structure (below).
+    * \c GLSL_TYPE_STRUCT or \c GLSL_TYPE_INTERFACE, it is the number of
+    * elements in the structure and the number of values pointed to by
+    * \c fields.structure (below).
     */
    unsigned length;
 
@@ -232,6 +241,14 @@ struct glsl_type {
 					       const char *name);
 
    /**
+    * Get the instance of an interface block type
+    */
+   static const glsl_type *get_interface_instance(const glsl_struct_field *fields,
+						  unsigned num_fields,
+						  enum glsl_interface_packing packing,
+						  const char *name);
+
+   /**
     * Query the total number of scalars that make up a scalar, vector or matrix
     */
    unsigned components() const
@@ -394,6 +411,14 @@ struct glsl_type {
    }
 
    /**
+    * Query whether or not a type is an interface
+    */
+   bool is_interface() const
+   {
+      return base_type == GLSL_TYPE_INTERFACE;
+   }
+
+   /**
     * Query whether or not a type is the void type singleton.
     */
    bool is_void() const
@@ -491,6 +516,10 @@ private:
    glsl_type(const glsl_struct_field *fields, unsigned num_fields,
 	     const char *name);
 
+   /** Constructor for interface types */
+   glsl_type(const glsl_struct_field *fields, unsigned num_fields,
+	     enum glsl_interface_packing packing, const char *name);
+
    /** Constructor for array types */
    glsl_type(const glsl_type *array, unsigned length);
 
@@ -500,6 +529,9 @@ private:
    /** Hash table containing the known record types. */
    static struct hash_table *record_types;
 
+   /** Hash table containing the known interface types. */
+   static struct hash_table *interface_types;
+
    static int record_key_compare(const void *a, const void *b);
    static unsigned record_key_hash(const void *key);
 
@@ -566,8 +598,15 @@ private:
 struct glsl_struct_field {
    const struct glsl_type *type;
    const char *name;
+   bool row_major;
 };
 
+static inline unsigned int
+glsl_align(unsigned int a, unsigned int align)
+{
+   return (a + align - 1) / align * align;
+}
+
 #endif /* __cplusplus */
 
 #endif /* GLSL_TYPES_H */
diff --git a/mesalib/src/glsl/hir_field_selection.cpp b/mesalib/src/glsl/hir_field_selection.cpp
index ac416d5da..0035a5f81 100644
--- a/mesalib/src/glsl/hir_field_selection.cpp
+++ b/mesalib/src/glsl/hir_field_selection.cpp
@@ -61,7 +61,8 @@ _mesa_ast_field_selection_to_hir(const ast_expression *expr,
 	 _mesa_glsl_error(& loc, state, "Invalid swizzle / mask `%s'",
 			  expr->primary_expression.identifier);
       }
-   } else if (op->type->base_type == GLSL_TYPE_STRUCT) {
+   } else if (op->type->base_type == GLSL_TYPE_STRUCT
+              || op->type->base_type == GLSL_TYPE_INTERFACE) {
       result = new(ctx) ir_dereference_record(op,
 					      expr->primary_expression.identifier);
 
diff --git a/mesalib/src/glsl/ir.cpp b/mesalib/src/glsl/ir.cpp
index 703f5ec58..954995db3 100644
--- a/mesalib/src/glsl/ir.cpp
+++ b/mesalib/src/glsl/ir.cpp
@@ -306,6 +306,8 @@ ir_expression::ir_expression(int op, ir_rvalue *op0)
       break;
 
    case ir_unop_noise:
+   case ir_unop_unpack_half_2x16_split_x:
+   case ir_unop_unpack_half_2x16_split_y:
       this->type = glsl_type::float_type;
       break;
 
@@ -313,6 +315,25 @@ ir_expression::ir_expression(int op, ir_rvalue *op0)
       this->type = glsl_type::bool_type;
       break;
 
+   case ir_unop_pack_snorm_2x16:
+   case ir_unop_pack_snorm_4x8:
+   case ir_unop_pack_unorm_2x16:
+   case ir_unop_pack_unorm_4x8:
+   case ir_unop_pack_half_2x16:
+      this->type = glsl_type::uint_type;
+      break;
+
+   case ir_unop_unpack_snorm_2x16:
+   case ir_unop_unpack_unorm_2x16:
+   case ir_unop_unpack_half_2x16:
+      this->type = glsl_type::vec2_type;
+      break;
+
+   case ir_unop_unpack_snorm_4x8:
+   case ir_unop_unpack_unorm_4x8:
+      this->type = glsl_type::vec4_type;
+      break;
+
    default:
       assert(!"not reached: missing automatic type setup for ir_expression");
       this->type = op0->type;
@@ -364,10 +385,15 @@ ir_expression::ir_expression(int op, ir_rvalue *op0, ir_rvalue *op1)
    case ir_binop_bit_and:
    case ir_binop_bit_xor:
    case ir_binop_bit_or:
+       assert(!op0->type->is_matrix());
+       assert(!op1->type->is_matrix());
       if (op0->type->is_scalar()) {
-	 this->type = op1->type;
+         this->type = op1->type;
       } else if (op1->type->is_scalar()) {
-	 this->type = op0->type;
+         this->type = op0->type;
+      } else {
+          assert(op0->type->vector_elements == op1->type->vector_elements);
+          this->type = op0->type;
       }
       break;
 
@@ -386,6 +412,10 @@ ir_expression::ir_expression(int op, ir_rvalue *op0, ir_rvalue *op1)
       this->type = glsl_type::float_type;
       break;
 
+   case ir_binop_pack_half_2x16_split:
+      this->type = glsl_type::uint_type;
+      break;
+
    case ir_binop_lshift:
    case ir_binop_rshift:
       this->type = op0->type;
@@ -454,6 +484,18 @@ static const char *const operator_strs[] = {
    "cos_reduced",
    "dFdx",
    "dFdy",
+   "packSnorm2x16",
+   "packSnorm4x8",
+   "packUnorm2x16",
+   "packUnorm4x8",
+   "packHalf2x16",
+   "unpackSnorm2x16",
+   "unpackSnorm4x8",
+   "unpackUnorm2x16",
+   "unpackUnorm4x8",
+   "unpackHalf2x16",
+   "unpackHalf2x16_split_x",
+   "unpackHalf2x16_split_y",
    "noise",
    "+",
    "-",
@@ -480,6 +522,7 @@ static const char *const operator_strs[] = {
    "min",
    "max",
    "pow",
+   "packHalf2x16_split",
    "ubo_load",
    "vector",
 };
@@ -1493,7 +1536,6 @@ ir_variable::ir_variable(const struct glsl_type *type, const char *name,
    this->has_initializer = false;
    this->location = -1;
    this->location_frac = 0;
-   this->uniform_block = -1;
    this->warn_extension = NULL;
    this->constant_value = NULL;
    this->constant_initializer = NULL;
@@ -1553,8 +1595,8 @@ modes_match(unsigned a, unsigned b)
       return true;
 
    /* Accept "in" vs. "const in" */
-   if ((a == ir_var_const_in && b == ir_var_in) ||
-       (b == ir_var_const_in && a == ir_var_in))
+   if ((a == ir_var_const_in && b == ir_var_function_in) ||
+       (b == ir_var_const_in && a == ir_var_function_in))
       return true;
 
    return false;
diff --git a/mesalib/src/glsl/ir.h b/mesalib/src/glsl/ir.h
index 85fc5ce95..efd80dad8 100644
--- a/mesalib/src/glsl/ir.h
+++ b/mesalib/src/glsl/ir.h
@@ -265,9 +265,11 @@ protected:
 enum ir_variable_mode {
    ir_var_auto = 0,     /**< Function local variables and globals. */
    ir_var_uniform,      /**< Variable declared as a uniform. */
-   ir_var_in,
-   ir_var_out,
-   ir_var_inout,
+   ir_var_shader_in,
+   ir_var_shader_out,
+   ir_var_function_in,
+   ir_var_function_out,
+   ir_var_function_inout,
    ir_var_const_in,	/**< "in" param that must be a constant expression */
    ir_var_system_value, /**< Ex: front-face, instance-id, etc. */
    ir_var_temporary	/**< Temporary variable generated during compilation. */
@@ -348,6 +350,41 @@ public:
    glsl_interp_qualifier determine_interpolation_mode(bool flat_shade);
 
    /**
+    * Determine whether or not a variable is part of a uniform block.
+    */
+   inline bool is_in_uniform_block() const
+   {
+      return this->mode == ir_var_uniform && this->interface_type != NULL;
+   }
+
+   /**
+    * Determine whether or not a variable is the declaration of an interface
+    * block
+    *
+    * For the first declaration below, there will be an \c ir_variable named
+    * "instance" whose type and whose interface_type will be the same
+    * \c glsl_type.  For the second declaration, there will be an \c ir_variable
+    * named "f" whose type is float and whose interface_type is B2.
+    *
+    * "instance" is an interface instance variable, but "f" is not.
+    *
+    * uniform B1 {
+    *     float f;
+    * } instance;
+    *
+    * uniform B2 {
+    *     float f;
+    * };
+    */
+   inline bool is_interface_instance() const
+   {
+      const glsl_type *const t = this->type;
+
+      return (t == this->interface_type)
+         || (t->is_array() && t->fields.array == this->interface_type);
+    }
+
+   /**
     * Declared type of the variable
     */
    const struct glsl_type *type;
@@ -401,7 +438,7 @@ public:
     *
     * \sa ir_variable_mode
     */
-   unsigned mode:3;
+   unsigned mode:4;
 
    /**
     * Interpolation mode for shader inputs / outputs
@@ -481,16 +518,6 @@ public:
    int location;
 
    /**
-    * Uniform block number for uniforms.
-    *
-    * This index is into the shader's list of uniform blocks, not the
-    * linked program's merged list.
-    *
-    * If the variable is not in a uniform block, the value will be -1.
-    */
-   int uniform_block;
-
-   /**
     * output index for dual source blending.
     */
    int index;
@@ -530,6 +557,14 @@ public:
     * objects.
     */
    ir_constant *constant_initializer;
+
+   /**
+    * For variables that are in an interface block or are an instance of an
+    * interface block, this is the \c GLSL_TYPE_INTERFACE type for that block.
+    *
+    * \sa ir_variable::location
+    */
+   const glsl_type *interface_type;
 };
 
 
@@ -908,7 +943,7 @@ public:
    unsigned write_mask:4;
 };
 
-/* Update ir_expression::num_operands() and operator_strs when
+/* Update ir_expression::get_num_operands() and operator_strs when
  * updating this list.
  */
 enum ir_expression_operation {
@@ -969,6 +1004,32 @@ enum ir_expression_operation {
    ir_unop_dFdy,
    /*@}*/
 
+   /**
+    * \name Floating point pack and unpack operations.
+    */
+   /*@{*/
+   ir_unop_pack_snorm_2x16,
+   ir_unop_pack_snorm_4x8,
+   ir_unop_pack_unorm_2x16,
+   ir_unop_pack_unorm_4x8,
+   ir_unop_pack_half_2x16,
+   ir_unop_unpack_snorm_2x16,
+   ir_unop_unpack_snorm_4x8,
+   ir_unop_unpack_unorm_2x16,
+   ir_unop_unpack_unorm_4x8,
+   ir_unop_unpack_half_2x16,
+   /*@}*/
+
+   /**
+    * \name Lowered floating point unpacking operations.
+    *
+    * \see lower_packing_builtins_visitor::split_unpack_half_2x16
+    */
+   /*@{*/
+   ir_unop_unpack_half_2x16_split_x,
+   ir_unop_unpack_half_2x16_split_y,
+   /*@}*/
+
    ir_unop_noise,
 
    /**
@@ -1036,6 +1097,15 @@ enum ir_expression_operation {
    ir_binop_pow,
 
    /**
+    * \name Lowered floating point packing operations.
+    *
+    * \see lower_packing_builtins_visitor::split_pack_half_2x16
+    */
+   /*@{*/
+   ir_binop_pack_half_2x16_split,
+   /*@}*/
+
+   /**
     * Load a value the size of a given GLSL type from a uniform block.
     *
     * operand0 is the ir_constant uniform block index in the linked shader.
diff --git a/mesalib/src/glsl/ir_builder.cpp b/mesalib/src/glsl/ir_builder.cpp
index c62f0b115..8fb30a02a 100644
--- a/mesalib/src/glsl/ir_builder.cpp
+++ b/mesalib/src/glsl/ir_builder.cpp
@@ -188,11 +188,27 @@ ir_expression *mul(operand a, operand b)
    return expr(ir_binop_mul, a, b);
 }
 
+ir_expression *div(operand a, operand b)
+{
+   return expr(ir_binop_div, a, b);
+}
+
+ir_expression *round_even(operand a)
+{
+   return expr(ir_unop_round_even, a);
+}
+
 ir_expression *dot(operand a, operand b)
 {
    return expr(ir_binop_dot, a, b);
 }
 
+ir_expression*
+clamp(operand a, operand b, operand c)
+{
+   return expr(ir_binop_min, expr(ir_binop_max, a, b), c);
+}
+
 ir_expression *
 saturate(operand a)
 {
@@ -203,4 +219,147 @@ saturate(operand a)
 	       new(mem_ctx) ir_constant(0.0f));
 }
 
+ir_expression*
+equal(operand a, operand b)
+{
+   return expr(ir_binop_equal, a, b);
+}
+
+ir_expression*
+less(operand a, operand b)
+{
+   return expr(ir_binop_less, a, b);
+}
+
+ir_expression*
+greater(operand a, operand b)
+{
+   return expr(ir_binop_greater, a, b);
+}
+
+ir_expression*
+lequal(operand a, operand b)
+{
+   return expr(ir_binop_lequal, a, b);
+}
+
+ir_expression*
+gequal(operand a, operand b)
+{
+   return expr(ir_binop_gequal, a, b);
+}
+
+ir_expression*
+logic_not(operand a)
+{
+   return expr(ir_unop_logic_not, a);
+}
+
+ir_expression*
+logic_and(operand a, operand b)
+{
+   return expr(ir_binop_logic_and, a, b);
+}
+
+ir_expression*
+logic_or(operand a, operand b)
+{
+   return expr(ir_binop_logic_or, a, b);
+}
+
+ir_expression*
+bit_not(operand a)
+{
+   return expr(ir_unop_bit_not, a);
+}
+
+ir_expression*
+bit_and(operand a, operand b)
+{
+   return expr(ir_binop_bit_and, a, b);
+}
+
+ir_expression*
+bit_or(operand a, operand b)
+{
+   return expr(ir_binop_bit_or, a, b);
+}
+
+ir_expression*
+lshift(operand a, operand b)
+{
+   return expr(ir_binop_lshift, a, b);
+}
+
+ir_expression*
+rshift(operand a, operand b)
+{
+   return expr(ir_binop_rshift, a, b);
+}
+
+ir_expression*
+f2i(operand a)
+{
+   return expr(ir_unop_f2i, a);
+}
+
+ir_expression*
+i2f(operand a)
+{
+   return expr(ir_unop_i2f, a);
+}
+
+ir_expression*
+i2u(operand a)
+{
+   return expr(ir_unop_i2u, a);
+}
+
+ir_expression*
+u2i(operand a)
+{
+   return expr(ir_unop_u2i, a);
+}
+
+ir_expression*
+f2u(operand a)
+{
+   return expr(ir_unop_f2u, a);
+}
+
+ir_expression*
+u2f(operand a)
+{
+   return expr(ir_unop_u2f, a);
+}
+
+ir_if*
+if_tree(operand condition,
+        ir_instruction *then_branch)
+{
+   assert(then_branch != NULL);
+
+   void *mem_ctx = ralloc_parent(condition.val);
+
+   ir_if *result = new(mem_ctx) ir_if(condition.val);
+   result->then_instructions.push_tail(then_branch);
+   return result;
+}
+
+ir_if*
+if_tree(operand condition,
+        ir_instruction *then_branch,
+        ir_instruction *else_branch)
+{
+   assert(then_branch != NULL);
+   assert(else_branch != NULL);
+
+   void *mem_ctx = ralloc_parent(condition.val);
+
+   ir_if *result = new(mem_ctx) ir_if(condition.val);
+   result->then_instructions.push_tail(then_branch);
+   result->else_instructions.push_tail(else_branch);
+   return result;
+}
+
 } /* namespace ir_builder */
diff --git a/mesalib/src/glsl/ir_builder.h b/mesalib/src/glsl/ir_builder.h
index 067858df4..690ac74eb 100644
--- a/mesalib/src/glsl/ir_builder.h
+++ b/mesalib/src/glsl/ir_builder.h
@@ -25,6 +25,15 @@
 
 namespace ir_builder {
 
+#ifndef WRITEMASK_X
+enum writemask {
+   WRITEMASK_X = 0x1,
+   WRITEMASK_Y = 0x2,
+   WRITEMASK_Z = 0x4,
+   WRITEMASK_W = 0x8,
+};
+#endif
+
 /**
  * This little class exists to let the helper expression generators
  * take either an ir_rvalue * or an ir_variable * to be automatically
@@ -73,9 +82,40 @@ public:
 
 class ir_factory {
 public:
+   ir_factory()
+      : instructions(NULL),
+        mem_ctx(NULL)
+   {
+      return;
+   }
+
    void emit(ir_instruction *ir);
    ir_variable *make_temp(const glsl_type *type, const char *name);
 
+   ir_constant*
+   constant(float f)
+   {
+      return new(mem_ctx) ir_constant(f);
+   }
+
+   ir_constant*
+   constant(int i)
+   {
+      return new(mem_ctx) ir_constant(i);
+   }
+
+   ir_constant*
+   constant(unsigned u)
+   {
+      return new(mem_ctx) ir_constant(u);
+   }
+
+   ir_constant*
+   constant(bool b)
+   {
+      return new(mem_ctx) ir_constant(b);
+   }
+
    exec_list *instructions;
    void *mem_ctx;
 };
@@ -88,9 +128,35 @@ ir_expression *expr(ir_expression_operation op, operand a, operand b);
 ir_expression *add(operand a, operand b);
 ir_expression *sub(operand a, operand b);
 ir_expression *mul(operand a, operand b);
+ir_expression *div(operand a, operand b);
+ir_expression *round_even(operand a);
 ir_expression *dot(operand a, operand b);
+ir_expression *clamp(operand a, operand b, operand c);
 ir_expression *saturate(operand a);
 
+ir_expression *equal(operand a, operand b);
+ir_expression *less(operand a, operand b);
+ir_expression *greater(operand a, operand b);
+ir_expression *lequal(operand a, operand b);
+ir_expression *gequal(operand a, operand b);
+
+ir_expression *logic_not(operand a);
+ir_expression *logic_and(operand a, operand b);
+ir_expression *logic_or(operand a, operand b);
+
+ir_expression *bit_not(operand a);
+ir_expression *bit_or(operand a, operand b);
+ir_expression *bit_and(operand a, operand b);
+ir_expression *lshift(operand a, operand b);
+ir_expression *rshift(operand a, operand b);
+
+ir_expression *f2i(operand a);
+ir_expression *i2f(operand a);
+ir_expression *f2u(operand a);
+ir_expression *u2f(operand a);
+ir_expression *i2u(operand a);
+ir_expression *u2i(operand a);
+
 /**
  * Swizzle away later components, but preserve the ordering.
  */
@@ -108,4 +174,10 @@ ir_swizzle *swizzle_xy(operand a);
 ir_swizzle *swizzle_xyz(operand a);
 ir_swizzle *swizzle_xyzw(operand a);
 
+ir_if *if_tree(operand condition,
+               ir_instruction *then_branch);
+ir_if *if_tree(operand condition,
+               ir_instruction *then_branch,
+               ir_instruction *else_branch);
+
 } /* namespace ir_builder */
diff --git a/mesalib/src/glsl/ir_clone.cpp b/mesalib/src/glsl/ir_clone.cpp
index c62c1fc20..b94ff05df 100644
--- a/mesalib/src/glsl/ir_clone.cpp
+++ b/mesalib/src/glsl/ir_clone.cpp
@@ -50,7 +50,6 @@ ir_variable::clone(void *mem_ctx, struct hash_table *ht) const
    var->interpolation = this->interpolation;
    var->location = this->location;
    var->index = this->index;
-   var->uniform_block = this->uniform_block;
    var->warn_extension = this->warn_extension;
    var->origin_upper_left = this->origin_upper_left;
    var->pixel_center_integer = this->pixel_center_integer;
@@ -77,6 +76,8 @@ ir_variable::clone(void *mem_ctx, struct hash_table *ht) const
       var->constant_initializer =
 	 this->constant_initializer->clone(mem_ctx, ht);
 
+   var->interface_type = this->interface_type;
+
    if (ht) {
       hash_table_insert(ht, var, (void *)const_cast<ir_variable *>(this));
    }
@@ -375,10 +376,15 @@ ir_constant::clone(void *mem_ctx, struct hash_table *ht) const
       return c;
    }
 
-   default:
+   case GLSL_TYPE_SAMPLER:
+   case GLSL_TYPE_VOID:
+   case GLSL_TYPE_ERROR:
+   case GLSL_TYPE_INTERFACE:
       assert(!"Should not get here.");
-      return NULL;
+      break;
    }
+
+   return NULL;
 }
 
 
diff --git a/mesalib/src/glsl/ir_constant_expression.cpp b/mesalib/src/glsl/ir_constant_expression.cpp
index 17b54b923..86b863f31 100644
--- a/mesalib/src/glsl/ir_constant_expression.cpp
+++ b/mesalib/src/glsl/ir_constant_expression.cpp
@@ -40,25 +40,6 @@
 #include "glsl_types.h"
 #include "program/hash_table.h"
 
-/* Using C99 rounding functions for roundToEven() implementation is
- * difficult, because round(), rint, and nearbyint() are affected by
- * fesetenv(), which the application may have done for its own
- * purposes.  Mesa's IROUND macro is close to what we want, but it
- * rounds away from 0 on n + 0.5.
- */
-static int
-round_to_even(float val)
-{
-   int rounded = IROUND(val);
-
-   if (val - floor(val) == 0.5) {
-      if (rounded % 2 != 0)
-	 rounded += val > 0 ? -1 : 1;
-   }
-
-   return rounded;
-}
-
 static float
 dot(ir_constant *op0, ir_constant *op1)
 {
@@ -94,6 +75,297 @@ bitcast_f2u(float f)
    return u;
 }
 
+/**
+ * Evaluate one component of a floating-point 4x8 packing function.
+ */
+typedef uint8_t
+(*pack_1x8_func_t)(float);
+
+/**
+ * Evaluate one component of a floating-point 2x16 packing function.
+ */
+typedef uint16_t
+(*pack_1x16_func_t)(float);
+
+/**
+ * Evaluate one component of a floating-point 4x8 unpacking function.
+ */
+typedef float
+(*unpack_1x8_func_t)(uint8_t);
+
+/**
+ * Evaluate one component of a floating-point 2x16 unpacking function.
+ */
+typedef float
+(*unpack_1x16_func_t)(uint16_t);
+
+/**
+ * Evaluate a 2x16 floating-point packing function.
+ */
+static uint32_t
+pack_2x16(pack_1x16_func_t pack_1x16,
+          float x, float y)
+{
+   /* From section 8.4 of the GLSL ES 3.00 spec:
+    *
+    *    packSnorm2x16
+    *    -------------
+    *    The first component of the vector will be written to the least
+    *    significant bits of the output; the last component will be written to
+    *    the most significant bits.
+    *
+    * The specifications for the other packing functions contain similar
+    * language.
+    */
+   uint32_t u = 0;
+   u |= ((uint32_t) pack_1x16(x) << 0);
+   u |= ((uint32_t) pack_1x16(y) << 16);
+   return u;
+}
+
+/**
+ * Evaluate a 4x8 floating-point packing function.
+ */
+static uint32_t
+pack_4x8(pack_1x8_func_t pack_1x8,
+         float x, float y, float z, float w)
+{
+   /* From section 8.4 of the GLSL 4.30 spec:
+    *
+    *    packSnorm4x8
+    *    ------------
+    *    The first component of the vector will be written to the least
+    *    significant bits of the output; the last component will be written to
+    *    the most significant bits.
+    *
+    * The specifications for the other packing functions contain similar
+    * language.
+    */
+   uint32_t u = 0;
+   u |= ((uint32_t) pack_1x8(x) << 0);
+   u |= ((uint32_t) pack_1x8(y) << 8);
+   u |= ((uint32_t) pack_1x8(z) << 16);
+   u |= ((uint32_t) pack_1x8(w) << 24);
+   return u;
+}
+
+/**
+ * Evaluate a 2x16 floating-point unpacking function.
+ */
+static void
+unpack_2x16(unpack_1x16_func_t unpack_1x16,
+            uint32_t u,
+            float *x, float *y)
+{
+    /* From section 8.4 of the GLSL ES 3.00 spec:
+     *
+     *    unpackSnorm2x16
+     *    ---------------
+     *    The first component of the returned vector will be extracted from
+     *    the least significant bits of the input; the last component will be
+     *    extracted from the most significant bits.
+     *
+     * The specifications for the other unpacking functions contain similar
+     * language.
+     */
+   *x = unpack_1x16((uint16_t) (u & 0xffff));
+   *y = unpack_1x16((uint16_t) (u >> 16));
+}
+
+/**
+ * Evaluate a 4x8 floating-point unpacking function.
+ */
+static void
+unpack_4x8(unpack_1x8_func_t unpack_1x8, uint32_t u,
+           float *x, float *y, float *z, float *w)
+{
+    /* From section 8.4 of the GLSL 4.30 spec:
+     *
+     *    unpackSnorm4x8
+     *    --------------
+     *    The first component of the returned vector will be extracted from
+     *    the least significant bits of the input; the last component will be
+     *    extracted from the most significant bits.
+     *
+     * The specifications for the other unpacking functions contain similar
+     * language.
+     */
+   *x = unpack_1x8((uint8_t) (u & 0xff));
+   *y = unpack_1x8((uint8_t) (u >> 8));
+   *z = unpack_1x8((uint8_t) (u >> 16));
+   *w = unpack_1x8((uint8_t) (u >> 24));
+}
+
+/**
+ * Evaluate one component of packSnorm4x8.
+ */
+static uint8_t
+pack_snorm_1x8(float x)
+{
+    /* From section 8.4 of the GLSL 4.30 spec:
+     *
+     *    packSnorm4x8
+     *    ------------
+     *    The conversion for component c of v to fixed point is done as
+     *    follows:
+     *
+     *      packSnorm4x8: round(clamp(c, -1, +1) * 127.0)
+     *
+     * We must first cast the float to an int, because casting a negative
+     * float to a uint is undefined.
+     */
+   return (uint8_t) (int8_t)
+          _mesa_round_to_even(CLAMP(x, -1.0f, +1.0f) * 127.0f);
+}
+
+/**
+ * Evaluate one component of packSnorm2x16.
+ */
+static uint16_t
+pack_snorm_1x16(float x)
+{
+    /* From section 8.4 of the GLSL ES 3.00 spec:
+     *
+     *    packSnorm2x16
+     *    -------------
+     *    The conversion for component c of v to fixed point is done as
+     *    follows:
+     *
+     *      packSnorm2x16: round(clamp(c, -1, +1) * 32767.0)
+     *
+     * We must first cast the float to an int, because casting a negative
+     * float to a uint is undefined.
+     */
+   return (uint16_t) (int16_t)
+          _mesa_round_to_even(CLAMP(x, -1.0f, +1.0f) * 32767.0f);
+}
+
+/**
+ * Evaluate one component of unpackSnorm4x8.
+ */
+static float
+unpack_snorm_1x8(uint8_t u)
+{
+    /* From section 8.4 of the GLSL 4.30 spec:
+     *
+     *    unpackSnorm4x8
+     *    --------------
+     *    The conversion for unpacked fixed-point value f to floating point is
+     *    done as follows:
+     *
+     *       unpackSnorm4x8: clamp(f / 127.0, -1, +1)
+     */
+   return CLAMP((int8_t) u / 127.0f, -1.0f, +1.0f);
+}
+
+/**
+ * Evaluate one component of unpackSnorm2x16.
+ */
+static float
+unpack_snorm_1x16(uint16_t u)
+{
+    /* From section 8.4 of the GLSL ES 3.00 spec:
+     *
+     *    unpackSnorm2x16
+     *    ---------------
+     *    The conversion for unpacked fixed-point value f to floating point is
+     *    done as follows:
+     *
+     *       unpackSnorm2x16: clamp(f / 32767.0, -1, +1)
+     */
+   return CLAMP((int16_t) u / 32767.0f, -1.0f, +1.0f);
+}
+
+/**
+ * Evaluate one component of packUnorm4x8.
+ */
+static uint8_t
+pack_unorm_1x8(float x)
+{
+    /* From section 8.4 of the GLSL 4.30 spec:
+     *
+     *    packUnorm4x8
+     *    ------------
+     *    The conversion for component c of v to fixed point is done as
+     *    follows:
+     *
+     *       packUnorm4x8: round(clamp(c, 0, +1) * 255.0)
+     */
+   return (uint8_t) _mesa_round_to_even(CLAMP(x, 0.0f, 1.0f) * 255.0f);
+}
+
+/**
+ * Evaluate one component of packUnorm2x16.
+ */
+static uint16_t
+pack_unorm_1x16(float x)
+{
+    /* From section 8.4 of the GLSL ES 3.00 spec:
+     *
+     *    packUnorm2x16
+     *    -------------
+     *    The conversion for component c of v to fixed point is done as
+     *    follows:
+     *
+     *       packUnorm2x16: round(clamp(c, 0, +1) * 65535.0)
+     */
+   return (uint16_t) _mesa_round_to_even(CLAMP(x, 0.0f, 1.0f) * 65535.0f);
+}
+
+/**
+ * Evaluate one component of unpackUnorm4x8.
+ */
+static float
+unpack_unorm_1x8(uint8_t u)
+{
+    /* From section 8.4 of the GLSL 4.30 spec:
+     *
+     *    unpackUnorm4x8
+     *    --------------
+     *    The conversion for unpacked fixed-point value f to floating point is
+     *    done as follows:
+     *
+     *       unpackUnorm4x8: f / 255.0
+     */
+   return (float) u / 255.0f;
+}
+
+/**
+ * Evaluate one component of unpackUnorm2x16.
+ */
+static float
+unpack_unorm_1x16(uint16_t u)
+{
+    /* From section 8.4 of the GLSL ES 3.00 spec:
+     *
+     *    unpackUnorm2x16
+     *    ---------------
+     *    The conversion for unpacked fixed-point value f to floating point is
+     *    done as follows:
+     *
+     *       unpackUnorm2x16: f / 65535.0
+     */
+   return (float) u / 65535.0f;
+}
+
+/**
+ * Evaluate one component of packHalf2x16.
+ */
+static uint16_t
+pack_half_1x16(float x)
+{
+   return _mesa_float_to_half(x);
+}
+
+/**
+ * Evaluate one component of unpackHalf2x16.
+ */
+static float
+unpack_half_1x16(uint16_t u)
+{
+   return _mesa_half_to_float(u);
+}
+
 ir_constant *
 ir_rvalue::constant_expression_value(struct hash_table *variable_context)
 {
@@ -279,7 +551,7 @@ ir_expression::constant_expression_value(struct hash_table *variable_context)
    case ir_unop_round_even:
       assert(op[0]->type->base_type == GLSL_TYPE_FLOAT);
       for (unsigned c = 0; c < op[0]->type->components(); c++) {
-	 data.f[c] = round_to_even(op[0]->value.f[c]);
+	 data.f[c] = _mesa_round_to_even(op[0]->value.f[c]);
       }
       break;
 
@@ -459,6 +731,70 @@ ir_expression::constant_expression_value(struct hash_table *variable_context)
       }
       break;
 
+   case ir_unop_pack_snorm_2x16:
+      assert(op[0]->type == glsl_type::vec2_type);
+      data.u[0] = pack_2x16(pack_snorm_1x16,
+                            op[0]->value.f[0],
+                            op[0]->value.f[1]);
+      break;
+   case ir_unop_pack_snorm_4x8:
+      assert(op[0]->type == glsl_type::vec4_type);
+      data.u[0] = pack_4x8(pack_snorm_1x8,
+                           op[0]->value.f[0],
+                           op[0]->value.f[1],
+                           op[0]->value.f[2],
+                           op[0]->value.f[3]);
+      break;
+   case ir_unop_unpack_snorm_2x16:
+      assert(op[0]->type == glsl_type::uint_type);
+      unpack_2x16(unpack_snorm_1x16,
+                  op[0]->value.u[0],
+                  &data.f[0], &data.f[1]);
+      break;
+   case ir_unop_unpack_snorm_4x8:
+      assert(op[0]->type == glsl_type::uint_type);
+      unpack_4x8(unpack_snorm_1x8,
+                 op[0]->value.u[0],
+                 &data.f[0], &data.f[1], &data.f[2], &data.f[3]);
+      break;
+   case ir_unop_pack_unorm_2x16:
+      assert(op[0]->type == glsl_type::vec2_type);
+      data.u[0] = pack_2x16(pack_unorm_1x16,
+                            op[0]->value.f[0],
+                            op[0]->value.f[1]);
+      break;
+   case ir_unop_pack_unorm_4x8:
+      assert(op[0]->type == glsl_type::vec4_type);
+      data.u[0] = pack_4x8(pack_unorm_1x8,
+                           op[0]->value.f[0],
+                           op[0]->value.f[1],
+                           op[0]->value.f[2],
+                           op[0]->value.f[3]);
+      break;
+   case ir_unop_unpack_unorm_2x16:
+      assert(op[0]->type == glsl_type::uint_type);
+      unpack_2x16(unpack_unorm_1x16,
+                  op[0]->value.u[0],
+                  &data.f[0], &data.f[1]);
+      break;
+   case ir_unop_unpack_unorm_4x8:
+      assert(op[0]->type == glsl_type::uint_type);
+      unpack_4x8(unpack_unorm_1x8,
+                 op[0]->value.u[0],
+                 &data.f[0], &data.f[1], &data.f[2], &data.f[3]);
+      break;
+   case ir_unop_pack_half_2x16:
+      assert(op[0]->type == glsl_type::vec2_type);
+      data.u[0] = pack_2x16(pack_half_1x16,
+                            op[0]->value.f[0],
+                            op[0]->value.f[1]);
+      break;
+   case ir_unop_unpack_half_2x16:
+      assert(op[0]->type == glsl_type::uint_type);
+      unpack_2x16(unpack_half_1x16,
+                  op[0]->value.u[0],
+                  &data.f[0], &data.f[1]);
+      break;
    case ir_binop_pow:
       assert(op[0]->type->base_type == GLSL_TYPE_FLOAT);
       for (unsigned c = 0; c < op[0]->type->components(); c++) {
diff --git a/mesalib/src/glsl/ir_function.cpp b/mesalib/src/glsl/ir_function.cpp
index a525693ed..fe4209c77 100644
--- a/mesalib/src/glsl/ir_function.cpp
+++ b/mesalib/src/glsl/ir_function.cpp
@@ -78,17 +78,17 @@ parameter_lists_match(const exec_list *list_a, const exec_list *list_b)
 	 return PARAMETER_LIST_NO_MATCH;
 
       case ir_var_const_in:
-      case ir_var_in:
+      case ir_var_function_in:
 	 if (!actual->type->can_implicitly_convert_to(param->type))
 	    return PARAMETER_LIST_NO_MATCH;
 	 break;
 
-      case ir_var_out:
+      case ir_var_function_out:
 	 if (!param->type->can_implicitly_convert_to(actual->type))
 	    return PARAMETER_LIST_NO_MATCH;
 	 break;
 
-      case ir_var_inout:
+      case ir_var_function_inout:
 	 /* Since there are no bi-directional automatic conversions (e.g.,
 	  * there is int -> float but no float -> int), inout parameters must
 	  * be exact matches.
diff --git a/mesalib/src/glsl/ir_optimization.h b/mesalib/src/glsl/ir_optimization.h
index 6b9519174..8f3301840 100644
--- a/mesalib/src/glsl/ir_optimization.h
+++ b/mesalib/src/glsl/ir_optimization.h
@@ -37,6 +37,31 @@
 #define MOD_TO_FRACT       0x20
 #define INT_DIV_TO_MUL_RCP 0x40
 
+/**
+ * \see class lower_packing_builtins_visitor
+ */
+enum lower_packing_builtins_op {
+   LOWER_PACK_UNPACK_NONE               = 0x0000,
+
+   LOWER_PACK_SNORM_2x16                = 0x0001,
+   LOWER_UNPACK_SNORM_2x16              = 0x0002,
+
+   LOWER_PACK_UNORM_2x16                = 0x0004,
+   LOWER_UNPACK_UNORM_2x16              = 0x0008,
+
+   LOWER_PACK_HALF_2x16                 = 0x0010,
+   LOWER_UNPACK_HALF_2x16               = 0x0020,
+
+   LOWER_PACK_HALF_2x16_TO_SPLIT        = 0x0040,
+   LOWER_UNPACK_HALF_2x16_TO_SPLIT      = 0x0080,
+
+   LOWER_PACK_SNORM_4x8                 = 0x0100,
+   LOWER_UNPACK_SNORM_4x8               = 0x0200,
+
+   LOWER_PACK_UNORM_4x8                 = 0x0400,
+   LOWER_UNPACK_UNORM_4x8               = 0x0800,
+};
+
 bool do_common_optimization(exec_list *ir, bool linked,
 			    bool uniform_locations_assigned,
 			    unsigned max_unroll_iterations);
@@ -74,6 +99,7 @@ bool lower_variable_index_to_cond_assign(exec_list *instructions,
 bool lower_quadop_vector(exec_list *instructions, bool dont_lower_swz);
 bool lower_clip_distance(gl_shader *shader);
 void lower_output_reads(exec_list *instructions);
+bool lower_packing_builtins(exec_list *instructions, int op_mask);
 void lower_ubo_reference(struct gl_shader *shader, exec_list *instructions);
 void lower_packed_varyings(void *mem_ctx, unsigned location_base,
                            unsigned locations_used, ir_variable_mode mode,
diff --git a/mesalib/src/glsl/ir_print_visitor.cpp b/mesalib/src/glsl/ir_print_visitor.cpp
index 8aa26e5d0..acc92dbf1 100644
--- a/mesalib/src/glsl/ir_print_visitor.cpp
+++ b/mesalib/src/glsl/ir_print_visitor.cpp
@@ -146,7 +146,8 @@ void ir_print_visitor::visit(ir_variable *ir)
 
    const char *const cent = (ir->centroid) ? "centroid " : "";
    const char *const inv = (ir->invariant) ? "invariant " : "";
-   const char *const mode[] = { "", "uniform ", "in ", "out ", "inout ",
+   const char *const mode[] = { "", "uniform ", "shader_in ", "shader_out ",
+                                "in ", "out ", "inout ",
 			        "const_in ", "sys ", "temporary " };
    const char *const interp[] = { "", "flat", "noperspective" };
 
diff --git a/mesalib/src/glsl/ir_reader.cpp b/mesalib/src/glsl/ir_reader.cpp
index 03dbb67c3..405e75b64 100644
--- a/mesalib/src/glsl/ir_reader.cpp
+++ b/mesalib/src/glsl/ir_reader.cpp
@@ -400,13 +400,17 @@ ir_reader::read_declaration(s_expression *expr)
       } else if (strcmp(qualifier->value(), "auto") == 0) {
 	 var->mode = ir_var_auto;
       } else if (strcmp(qualifier->value(), "in") == 0) {
-	 var->mode = ir_var_in;
+	 var->mode = ir_var_function_in;
+      } else if (strcmp(qualifier->value(), "shader_in") == 0) {
+         var->mode = ir_var_shader_in;
       } else if (strcmp(qualifier->value(), "const_in") == 0) {
 	 var->mode = ir_var_const_in;
       } else if (strcmp(qualifier->value(), "out") == 0) {
-	 var->mode = ir_var_out;
+	 var->mode = ir_var_function_out;
+      } else if (strcmp(qualifier->value(), "shader_out") == 0) {
+	 var->mode = ir_var_shader_out;
       } else if (strcmp(qualifier->value(), "inout") == 0) {
-	 var->mode = ir_var_inout;
+	 var->mode = ir_var_function_inout;
       } else if (strcmp(qualifier->value(), "temporary") == 0) {
 	 var->mode = ir_var_temporary;
       } else if (strcmp(qualifier->value(), "smooth") == 0) {
diff --git a/mesalib/src/glsl/ir_set_program_inouts.cpp b/mesalib/src/glsl/ir_set_program_inouts.cpp
index e5de07e01..1e102bfbb 100644
--- a/mesalib/src/glsl/ir_set_program_inouts.cpp
+++ b/mesalib/src/glsl/ir_set_program_inouts.cpp
@@ -85,7 +85,7 @@ mark(struct gl_program *prog, ir_variable *var, int offset, int len,
 
    for (int i = 0; i < len; i++) {
       GLbitfield64 bitfield = BITFIELD64_BIT(var->location + var->index + offset + i);
-      if (var->mode == ir_var_in) {
+      if (var->mode == ir_var_shader_in) {
 	 prog->InputsRead |= bitfield;
          if (is_fragment_shader) {
             gl_fragment_program *fprog = (gl_fragment_program *) prog;
@@ -152,8 +152,8 @@ ir_set_program_inouts_visitor::visit_enter(ir_dereference_array *ir)
 ir_visitor_status
 ir_set_program_inouts_visitor::visit(ir_variable *ir)
 {
-   if (ir->mode == ir_var_in ||
-       ir->mode == ir_var_out ||
+   if (ir->mode == ir_var_shader_in ||
+       ir->mode == ir_var_shader_out ||
        ir->mode == ir_var_system_value) {
       hash_table_insert(this->ht, ir, ir);
    }
diff --git a/mesalib/src/glsl/ir_validate.cpp b/mesalib/src/glsl/ir_validate.cpp
index ad57a3149..d8cafd55f 100644
--- a/mesalib/src/glsl/ir_validate.cpp
+++ b/mesalib/src/glsl/ir_validate.cpp
@@ -329,6 +329,38 @@ ir_validate::visit_leave(ir_expression *ir)
       assert(ir->operands[0]->type == ir->type);
       break;
 
+   case ir_unop_pack_snorm_2x16:
+   case ir_unop_pack_unorm_2x16:
+   case ir_unop_pack_half_2x16:
+      assert(ir->type == glsl_type::uint_type);
+      assert(ir->operands[0]->type == glsl_type::vec2_type);
+      break;
+
+   case ir_unop_pack_snorm_4x8:
+   case ir_unop_pack_unorm_4x8:
+      assert(ir->type == glsl_type::uint_type);
+      assert(ir->operands[0]->type == glsl_type::vec4_type);
+      break;
+
+   case ir_unop_unpack_snorm_2x16:
+   case ir_unop_unpack_unorm_2x16:
+   case ir_unop_unpack_half_2x16:
+      assert(ir->type == glsl_type::vec2_type);
+      assert(ir->operands[0]->type == glsl_type::uint_type);
+      break;
+
+   case ir_unop_unpack_snorm_4x8:
+   case ir_unop_unpack_unorm_4x8:
+      assert(ir->type == glsl_type::vec4_type);
+      assert(ir->operands[0]->type == glsl_type::uint_type);
+      break;
+
+   case ir_unop_unpack_half_2x16_split_x:
+   case ir_unop_unpack_half_2x16_split_y:
+      assert(ir->type == glsl_type::float_type);
+      assert(ir->operands[0]->type == glsl_type::uint_type);
+      break;
+
    case ir_unop_noise:
       /* XXX what can we assert here? */
       break;
@@ -423,6 +455,12 @@ ir_validate::visit_leave(ir_expression *ir)
       assert(ir->operands[0]->type == ir->operands[1]->type);
       break;
 
+   case ir_binop_pack_half_2x16_split:
+      assert(ir->type == glsl_type::uint_type);
+      assert(ir->operands[0]->type == glsl_type::float_type);
+      assert(ir->operands[1]->type == glsl_type::float_type);
+      break;
+
    case ir_binop_ubo_load:
       assert(ir->operands[0]->as_constant());
       assert(ir->operands[0]->type == glsl_type::uint_type);
@@ -605,8 +643,8 @@ ir_validate::visit_enter(ir_call *ir)
          printf("ir_call parameter type mismatch:\n");
          goto dump_ir;
       }
-      if (formal_param->mode == ir_var_out
-          || formal_param->mode == ir_var_inout) {
+      if (formal_param->mode == ir_var_function_out
+          || formal_param->mode == ir_var_function_inout) {
          if (!actual_param->is_lvalue()) {
             printf("ir_call out/inout parameters must be lvalues:\n");
             goto dump_ir;
diff --git a/mesalib/src/glsl/link_uniform_block_active_visitor.cpp b/mesalib/src/glsl/link_uniform_block_active_visitor.cpp
new file mode 100644
index 000000000..56a8384e9
--- /dev/null
+++ b/mesalib/src/glsl/link_uniform_block_active_visitor.cpp
@@ -0,0 +1,162 @@
+/*
+ * Copyright © 2013 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include "link_uniform_block_active_visitor.h"
+#include "program.h"
+
+link_uniform_block_active *
+process_block(void *mem_ctx, struct hash_table *ht, ir_variable *var)
+{
+   const uint32_t h = _mesa_hash_string(var->interface_type->name);
+   const hash_entry *const existing_block =
+      _mesa_hash_table_search(ht, h, var->interface_type->name);
+
+   const glsl_type *const block_type = var->is_interface_instance()
+      ? var->type : var->interface_type;
+
+
+   /* If a block with this block-name has not previously been seen, add it.
+    * If a block with this block-name has been seen, it must be identical to
+    * the block currently being examined.
+    */
+   if (existing_block == NULL) {
+      link_uniform_block_active *const b =
+	 rzalloc(mem_ctx, struct link_uniform_block_active);
+
+      b->type = block_type;
+      b->has_instance_name = var->is_interface_instance();
+
+      _mesa_hash_table_insert(ht, h, var->interface_type->name,
+			      (void *) b);
+      return b;
+   } else {
+      link_uniform_block_active *const b =
+	 (link_uniform_block_active *) existing_block->data;
+
+      if (b->type != block_type
+	  || b->has_instance_name != var->is_interface_instance())
+	 return NULL;
+      else
+	 return b;
+   }
+
+   assert(!"Should not get here.");
+   return NULL;
+}
+
+ir_visitor_status
+link_uniform_block_active_visitor::visit_enter(ir_dereference_array *ir)
+{
+   ir_dereference_variable *const d = ir->array->as_dereference_variable();
+   ir_variable *const var = (d == NULL) ? NULL : d->var;
+
+   /* If the r-value being dereferenced is not a variable (e.g., a field of a
+    * structure) or is not a uniform block instance, continue.
+    *
+    * WARNING: It is not enough for the variable to be part of a uniform block.
+    * It must represent the entire block.  Arrays (or matrices) inside blocks
+    * that lack an instance name are handled by the ir_dereference_variable
+    * function.
+    */
+   if (var == NULL
+       || !var->is_in_uniform_block()
+       || !var->is_interface_instance())
+      return visit_continue;
+
+   /* Process the block.  Bail if there was an error.
+    */
+   link_uniform_block_active *const b =
+      process_block(this->mem_ctx, this->ht, var);
+   if (b == NULL) {
+      linker_error(prog,
+		   "uniform block `%s' has mismatching definitions",
+		   var->interface_type->name);
+      this->success = false;
+      return visit_stop;
+   }
+
+   /* Block arrays must be declared with an instance name.
+    */
+   assert(b->has_instance_name);
+   assert((b->num_array_elements == 0) == (b->array_elements == NULL));
+   assert(b->type != NULL);
+
+   /* Determine whether or not this array index has already been added to the
+    * list of active array indices.  At this point all constant folding must
+    * have occurred, and the array index must be a constant.
+    */
+   ir_constant *c = ir->array_index->as_constant();
+   assert(c != NULL);
+
+   const unsigned idx = c->get_uint_component(0);
+
+   unsigned i;
+   for (i = 0; i < b->num_array_elements; i++) {
+      if (b->array_elements[i] == idx)
+	 break;
+   }
+
+   assert(i <= b->num_array_elements);
+
+   if (i == b->num_array_elements) {
+      b->array_elements = reralloc(this->mem_ctx,
+				   b->array_elements,
+				   unsigned,
+				   b->num_array_elements + 1);
+
+      b->array_elements[b->num_array_elements] = idx;
+
+      b->num_array_elements++;
+   }
+
+   return visit_continue_with_parent;
+}
+
+ir_visitor_status
+link_uniform_block_active_visitor::visit(ir_dereference_variable *ir)
+{
+   ir_variable *var = ir->var;
+
+   if (!var->is_in_uniform_block())
+      return visit_continue;
+
+   assert(!var->is_interface_instance() || !var->type->is_array());
+
+   /* Process the block.  Bail if there was an error.
+    */
+   link_uniform_block_active *const b =
+      process_block(this->mem_ctx, this->ht, var);
+   if (b == NULL) {
+      linker_error(this->prog,
+		   "uniform block `%s' has mismatching definitions",
+		   var->interface_type->name);
+      this->success = false;
+      return visit_stop;
+   }
+
+   assert(b->num_array_elements == 0);
+   assert(b->array_elements == NULL);
+   assert(b->type != NULL);
+
+   return visit_continue;
+}
diff --git a/mesalib/src/glsl/link_uniform_block_active_visitor.h b/mesalib/src/glsl/link_uniform_block_active_visitor.h
new file mode 100644
index 000000000..fba628a8f
--- /dev/null
+++ b/mesalib/src/glsl/link_uniform_block_active_visitor.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright © 2013 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#pragma once
+#ifndef LINK_UNIFORM_BLOCK_ACTIVE_VISITOR_H
+#define LINK_UNIFORM_BLOCK_ACTIVE_VISITOR_H
+
+#include "ir.h"
+#include "ir_visitor.h"
+#include "glsl_types.h"
+#include "main/hash_table.h"
+
+struct link_uniform_block_active {
+   const glsl_type *type;
+
+   unsigned *array_elements;
+   unsigned num_array_elements;
+
+   bool has_instance_name;
+};
+
+class link_uniform_block_active_visitor : public ir_hierarchical_visitor {
+public:
+   link_uniform_block_active_visitor(void *mem_ctx, struct hash_table *ht,
+				     struct gl_shader_program *prog)
+      : success(true), prog(prog), ht(ht), mem_ctx(mem_ctx)
+   {
+      /* empty */
+   }
+
+   virtual ir_visitor_status visit_enter(ir_dereference_array *);
+   virtual ir_visitor_status visit(ir_dereference_variable *);
+
+   bool success;
+
+private:
+   struct gl_shader_program *prog;
+   struct hash_table *ht;
+   void *mem_ctx;
+};
+
+#endif /* LINK_UNIFORM_BLOCK_ACTIVE_VISITOR_H */
diff --git a/mesalib/src/glsl/link_uniform_blocks.cpp b/mesalib/src/glsl/link_uniform_blocks.cpp
new file mode 100644
index 000000000..74fe1e29f
--- /dev/null
+++ b/mesalib/src/glsl/link_uniform_blocks.cpp
@@ -0,0 +1,313 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include "main/core.h"
+#include "ir.h"
+#include "linker.h"
+#include "ir_uniform.h"
+#include "link_uniform_block_active_visitor.h"
+#include "main/hash_table.h"
+#include "program.h"
+
+/* Fills a caller-supplied variable table with std140-laid-out members. */
+class ubo_visitor : public uniform_field_visitor {
+public:
+   ubo_visitor(void *mem_ctx, gl_uniform_buffer_variable *variables,
+               unsigned num_variables)
+      : index(0), offset(0), buffer_size(0), variables(variables),
+        num_variables(num_variables), mem_ctx(mem_ctx), is_array_instance(false)
+   {
+   }
+
+   /* Lay out one block.  Offsets restart at zero for every block, while
+    * index keeps advancing through the shared variables table.
+    */
+   void process(const glsl_type *type, const char *name)
+   {
+      offset = 0;
+      buffer_size = 0;
+      is_array_instance = (strchr(name, ']') != NULL);
+      uniform_field_visitor::process(type, name);
+   }
+
+   unsigned index;          /* next unused slot in variables */
+   unsigned offset;         /* running byte offset in the current block */
+   unsigned buffer_size;    /* current block's offset rounded up to 16 */
+   gl_uniform_buffer_variable *variables;
+   unsigned num_variables;
+   void *mem_ctx;
+   bool is_array_instance;  /* name passed to process() contained a ']' */
+
+private:
+   virtual void visit_field(const glsl_type *type, const char *name,
+                            bool row_major)
+   {
+      assert(index < num_variables);
+
+      gl_uniform_buffer_variable *const slot = &variables[index++];
+      slot->Name = ralloc_strdup(mem_ctx, name);
+      slot->Type = type;
+      slot->RowMajor = row_major;
+
+      if (!is_array_instance) {
+         slot->IndexName = slot->Name;
+      } else {
+         /* IndexName is the name minus its first "[...]" subscript. */
+         slot->IndexName = ralloc_strdup(mem_ctx, name);
+
+         char *const lbracket = strchr(slot->IndexName, '[');
+         assert(lbracket != NULL);
+         char *const rbracket = strchr(lbracket, ']');
+         assert(rbracket != NULL);
+
+         /* Slide the text after ']' (terminating NUL included) over '['. */
+         const char *const tail = rbracket + 1;
+         memmove(lbracket, tail, strlen(tail) + 1);
+      }
+
+      const unsigned alignment = type->std140_base_alignment(slot->RowMajor);
+      const unsigned size = type->std140_size(slot->RowMajor);
+
+      offset = glsl_align(offset, alignment);
+      slot->Offset = offset;
+      offset += size;
+
+      /* From the GL_ARB_uniform_buffer_object spec:
+       *
+       *     "For uniform blocks laid out according to [std140] rules, the
+       *      minimum buffer object size returned by the
+       *      UNIFORM_BLOCK_DATA_SIZE query is derived by taking the offset of
+       *      the last basic machine unit consumed by the last uniform of the
+       *      uniform block (including any end-of-array or end-of-structure
+       *      padding), adding one, and rounding up to the next multiple of
+       *      the base alignment required for a vec4."
+       */
+      buffer_size = glsl_align(offset, 16);
+   }
+
+   virtual void visit_field(const glsl_struct_field *field)
+   {
+      /* Before descending into a struct member, pad to its alignment. */
+      offset = glsl_align(offset, field->type->std140_base_alignment(false));
+   }
+};
+
+/* Visitor that only tallies the fields reported to visit_field(). */
+class count_block_size : public uniform_field_visitor {
+public:
+   count_block_size()
+      : num_active_uniforms(0)
+   {
+   }
+
+   /* Running tally.  It is never cleared by this class, so users zero it
+    * themselves between process() calls.
+    */
+   unsigned num_active_uniforms;
+
+private:
+   virtual void visit_field(const glsl_type *, const char *, bool)
+   {
+      ++num_active_uniforms;
+   }
+};
+
+struct block {   /* NOTE(review): not referenced anywhere in this file */
+   const glsl_type *type;
+   bool has_instance_name;
+};
+
+/**
+ * Build the API-visible description of every active uniform block.
+ * \param blocks_ret  receives the block array (allocated from \p mem_ctx);
+ *                    it is only written when the result is positive.
+ * \return the number of blocks, 0 if none are active, or -1 if the
+ *         active-block visitor reported a failure.
+ */
+int
+link_uniform_blocks(void *mem_ctx,
+                    struct gl_shader_program *prog,
+                    struct gl_shader **shader_list,
+                    unsigned num_shaders,
+                    struct gl_uniform_block **blocks_ret)
+{
+   /* Tracks every uniform block that has been encountered.  Blocks with
+    * the same block-name must be the same, so it is keyed by block-name.
+    */
+   struct hash_table *block_hash =
+      _mesa_hash_table_create(mem_ctx, _mesa_key_string_equal);
+
+   /* Determine which uniform blocks are active. */
+   link_uniform_block_active_visitor v(mem_ctx, block_hash, prog);
+   for (unsigned i = 0; i < num_shaders; i++) {
+      visit_list_elements(&v, shader_list[i]->ir);
+   }
+
+   /* The visitor's success flag is its only status output; report a
+    * cleared flag as a negative count, which callers already test for.
+    */
+   if (!v.success) {
+      _mesa_hash_table_destroy(block_hash, NULL);
+      return -1;
+   }
+
+   /* Count the active uniform blocks and the total number of active slots
+    * in those blocks.
+    */
+   unsigned num_blocks = 0;
+   unsigned num_variables = 0;
+   count_block_size block_size;
+   struct hash_entry *entry;
+
+   hash_table_foreach (block_hash, entry) {
+      const struct link_uniform_block_active *const b =
+         (const struct link_uniform_block_active *) entry->data;
+      const glsl_type *const block_type =
+         b->type->is_array() ? b->type->fields.array : b->type;
+
+      assert((b->num_array_elements > 0) == b->type->is_array());
+
+      block_size.num_active_uniforms = 0;
+      block_size.process(block_type, "");
+
+      if (b->num_array_elements > 0) {
+         num_blocks += b->num_array_elements;
+         num_variables += b->num_array_elements
+            * block_size.num_active_uniforms;
+      } else {
+         num_blocks++;
+         num_variables += block_size.num_active_uniforms;
+      }
+   }
+
+   if (num_blocks == 0) {
+      assert(num_variables == 0);
+      _mesa_hash_table_destroy(block_hash, NULL);
+      return 0;
+   }
+   assert(num_variables != 0);
+
+   /* Allocate storage for all of the uniform-block information that can
+    * be queried through the API.
+    */
+   gl_uniform_block *blocks =
+      ralloc_array(mem_ctx, gl_uniform_block, num_blocks);
+   gl_uniform_buffer_variable *variables =
+      ralloc_array(blocks, gl_uniform_buffer_variable, num_variables);
+
+   /* Add each variable of each block to the API tracking structures. */
+   unsigned i = 0;
+   ubo_visitor parcel(blocks, variables, num_variables);
+
+   STATIC_ASSERT(unsigned(GLSL_INTERFACE_PACKING_STD140)
+                 == unsigned(ubo_packing_std140));
+   STATIC_ASSERT(unsigned(GLSL_INTERFACE_PACKING_SHARED)
+                 == unsigned(ubo_packing_shared));
+   STATIC_ASSERT(unsigned(GLSL_INTERFACE_PACKING_PACKED)
+                 == unsigned(ubo_packing_packed));
+
+   hash_table_foreach (block_hash, entry) {
+      const struct link_uniform_block_active *const b =
+         (const struct link_uniform_block_active *) entry->data;
+      const glsl_type *block_type = b->type;
+
+      if (b->num_array_elements > 0) {
+         const char *const name = block_type->fields.array->name;
+         assert(b->has_instance_name);
+         for (unsigned j = 0; j < b->num_array_elements; j++) {
+            blocks[i].Name = ralloc_asprintf(blocks, "%s[%u]", name,
+                                             b->array_elements[j]);
+            blocks[i].Uniforms = &variables[parcel.index];
+            blocks[i].Binding = 0;
+            blocks[i].UniformBufferSize = 0;
+            blocks[i]._Packing =
+               gl_uniform_block_packing(block_type->interface_packing);
+
+            parcel.process(block_type->fields.array, blocks[i].Name);
+
+            blocks[i].UniformBufferSize = parcel.buffer_size;
+            blocks[i].NumUniforms =
+               (unsigned)(ptrdiff_t)(&variables[parcel.index] - blocks[i].Uniforms);
+            i++;
+         }
+      } else {
+         blocks[i].Name = ralloc_strdup(blocks, block_type->name);
+         blocks[i].Uniforms = &variables[parcel.index];
+         blocks[i].Binding = 0;
+         blocks[i].UniformBufferSize = 0;
+         blocks[i]._Packing =
+            gl_uniform_block_packing(block_type->interface_packing);
+
+         parcel.process(block_type,
+                        b->has_instance_name ? block_type->name : "");
+
+         blocks[i].UniformBufferSize = parcel.buffer_size;
+         blocks[i].NumUniforms =
+            (unsigned)(ptrdiff_t)(&variables[parcel.index] - blocks[i].Uniforms);
+         i++;
+      }
+   }
+
+   assert(parcel.index == num_variables);
+   _mesa_hash_table_destroy(block_hash, NULL);
+
+   *blocks_ret = blocks;
+   return num_blocks;
+}
+
+/**
+ * Check whether two same-named uniform blocks agree on member count,
+ * packing, and each member's name, type and row-major setting.
+ */
+bool
+link_uniform_blocks_are_compatible(const gl_uniform_block *a,
+				   const gl_uniform_block *b)
+{
+   assert(strcmp(a->Name, b->Name) == 0);
+
+   /* Page 35 (page 42 of the PDF) in section 4.3.7 of the GLSL 1.50 spec says:
+    *
+    *     "Matched block names within an interface (as defined above) must
+    *     match in terms of having the same number of declarations with the
+    *     same sequence of types and the same sequence of member names, as
+    *     well as having the same member-wise layout qualification....if a
+    *     matching block is declared as an array, then the array sizes must
+    *     also match... Any mismatch will generate a link error."
+    *
+    * Arrays are not yet supported, so there is no check for that.
+    */
+   if (a->NumUniforms != b->NumUniforms || a->_Packing != b->_Packing)
+      return false;
+
+   for (unsigned i = 0; i < a->NumUniforms; i++) {
+      const gl_uniform_buffer_variable *const lhs = &a->Uniforms[i];
+      const gl_uniform_buffer_variable *const rhs = &b->Uniforms[i];
+
+      if (strcmp(lhs->Name, rhs->Name) != 0
+          || lhs->Type != rhs->Type
+          || lhs->RowMajor != rhs->RowMajor)
+         return false;
+   }
+
+   return true;
+}
diff --git a/mesalib/src/glsl/link_uniform_initializers.cpp b/mesalib/src/glsl/link_uniform_initializers.cpp
index 849e08097..836a360fa 100644
--- a/mesalib/src/glsl/link_uniform_initializers.cpp
+++ b/mesalib/src/glsl/link_uniform_initializers.cpp
@@ -67,7 +67,11 @@ copy_constant_to_storage(union gl_constant_value *storage,
       case GLSL_TYPE_BOOL:
 	 storage[i].b = int(val->value.b[i]);
 	 break;
-      default:
+      case GLSL_TYPE_ARRAY:
+      case GLSL_TYPE_STRUCT:
+      case GLSL_TYPE_INTERFACE:
+      case GLSL_TYPE_VOID:
+      case GLSL_TYPE_ERROR:
 	 /* All other types should have already been filtered by other
 	  * paths in the caller.
 	  */
diff --git a/mesalib/src/glsl/link_uniforms.cpp b/mesalib/src/glsl/link_uniforms.cpp
index 07d9c18de..f1284adb2 100644
--- a/mesalib/src/glsl/link_uniforms.cpp
+++ b/mesalib/src/glsl/link_uniforms.cpp
@@ -29,12 +29,6 @@
 #include "program/hash_table.h"
 #include "program.h"
 
-static inline unsigned int
-align(unsigned int a, unsigned int align)
-{
-   return (a + align - 1) / align * align;
-}
-
 /**
  * \file link_uniforms.cpp
  * Assign locations for GLSL uniforms.
@@ -58,23 +52,49 @@ values_for_type(const glsl_type *type)
 }
 
 void
+uniform_field_visitor::process(const glsl_type *type, const char *name)
+{
+   assert(type->is_record()
+          || (type->is_array() && type->fields.array->is_record())
+          || type->is_interface()
+          || (type->is_array() && type->fields.array->is_interface()));
+
+   char *name_copy = ralloc_strdup(NULL, name);
+   recursion(type, &name_copy, strlen(name), false);
+   ralloc_free(name_copy);
+}
+
+void
 uniform_field_visitor::process(ir_variable *var)
 {
    const glsl_type *t = var->type;
 
+   /* false is always passed for the row_major parameter to the other
+    * processing functions because no information is available to do
+    * otherwise.  See the warning in linker.h.
+    */
+
    /* Only strdup the name if we actually will need to modify it. */
    if (t->is_record() || (t->is_array() && t->fields.array->is_record())) {
       char *name = ralloc_strdup(NULL, var->name);
-      recursion(var->type, &name, strlen(name));
+      recursion(var->type, &name, strlen(name), false);
+      ralloc_free(name);
+   } else if (t->is_interface()) {
+      char *name = ralloc_strdup(NULL, var->type->name);
+      recursion(var->type, &name, strlen(name), false);
+      ralloc_free(name);
+   } else if (t->is_array() && t->fields.array->is_interface()) {
+      char *name = ralloc_strdup(NULL, var->type->fields.array->name);
+      recursion(var->type, &name, strlen(name), false);
       ralloc_free(name);
    } else {
-      this->visit_field(t, var->name);
+      this->visit_field(t, var->name, false);
    }
 }
 
 void
 uniform_field_visitor::recursion(const glsl_type *t, char **name,
-				 size_t name_length)
+                                 size_t name_length, bool row_major)
 {
    /* Records need to have each field processed individually.
     *
@@ -82,30 +102,47 @@ uniform_field_visitor::recursion(const glsl_type *t, char **name,
     * individually, then each field of the resulting array elements processed
     * individually.
     */
-   if (t->is_record()) {
+   if (t->is_record() || t->is_interface()) {
       for (unsigned i = 0; i < t->length; i++) {
 	 const char *field = t->fields.structure[i].name;
 	 size_t new_length = name_length;
 
-	 /* Append '.field' to the current uniform name. */
-	 ralloc_asprintf_rewrite_tail(name, &new_length, ".%s", field);
+         if (t->fields.structure[i].type->is_record())
+            this->visit_field(&t->fields.structure[i]);
+
+         /* Append '.field' to the current uniform name. */
+         if (name_length == 0) {
+            ralloc_asprintf_rewrite_tail(name, &new_length, "%s", field);
+         } else {
+            ralloc_asprintf_rewrite_tail(name, &new_length, ".%s", field);
+         }
 
-	 recursion(t->fields.structure[i].type, name, new_length);
+         recursion(t->fields.structure[i].type, name, new_length,
+                   t->fields.structure[i].row_major);
       }
-   } else if (t->is_array() && t->fields.array->is_record()) {
+   } else if (t->is_array() && (t->fields.array->is_record()
+                                || t->fields.array->is_interface())) {
       for (unsigned i = 0; i < t->length; i++) {
 	 size_t new_length = name_length;
 
 	 /* Append the subscript to the current uniform name */
 	 ralloc_asprintf_rewrite_tail(name, &new_length, "[%u]", i);
 
-	 recursion(t->fields.array, name, new_length);
+         /* t is an array type, so only the inherited row_major is valid. */
+         recursion(t->fields.array, name, new_length, row_major);
       }
    } else {
-      this->visit_field(t, *name);
+      this->visit_field(t, *name, row_major);
    }
 }
 
+void
+uniform_field_visitor::visit_field(const glsl_struct_field *field)
+{
+   (void) field;
+   /* empty */
+}
+
 /**
  * Class to help calculate the storage requirements for a set of uniforms
  *
@@ -131,6 +168,15 @@ public:
       this->num_shader_uniform_components = 0;
    }
 
+   void process(ir_variable *var)
+   {
+      if (var->is_interface_instance())
+         uniform_field_visitor::process(var->interface_type,
+                                        var->interface_type->name);
+      else
+         uniform_field_visitor::process(var);
+   }
+
    /**
     * Total number of active uniforms counted
     */
@@ -152,10 +198,15 @@ public:
    unsigned num_shader_uniform_components;
 
 private:
-   virtual void visit_field(const glsl_type *type, const char *name)
+   virtual void visit_field(const glsl_type *type, const char *name,
+                            bool row_major)
    {
       assert(!type->is_record());
       assert(!(type->is_array() && type->fields.array->is_record()));
+      assert(!type->is_interface());
+      assert(!(type->is_array() && type->fields.array->is_interface()));
+
+      (void) row_major;
 
       /* Count the number of samplers regardless of whether the uniform is
        * already in the hash table.  The hash table prevents adding the same
@@ -224,42 +275,77 @@ public:
    }
 
    void set_and_process(struct gl_shader_program *prog,
-			struct gl_shader *shader,
 			ir_variable *var)
    {
-      ubo_var = NULL;
-      if (var->uniform_block != -1) {
-	 struct gl_uniform_block *block =
-	    &shader->UniformBlocks[var->uniform_block];
-
-	 ubo_block_index = -1;
-	 for (unsigned i = 0; i < prog->NumUniformBlocks; i++) {
-	    if (!strcmp(prog->UniformBlocks[i].Name,
-			shader->UniformBlocks[var->uniform_block].Name)) {
-	       ubo_block_index = i;
-	       break;
+      ubo_block_index = -1;
+      if (var->is_in_uniform_block()) {
+         if (var->is_interface_instance() && var->type->is_array()) {
+            unsigned l = strlen(var->interface_type->name);
+
+            for (unsigned i = 0; i < prog->NumUniformBlocks; i++) {
+               if (strncmp(var->interface_type->name,
+                           prog->UniformBlocks[i].Name,
+                           l) == 0
+                   && prog->UniformBlocks[i].Name[l] == '[') {
+                  ubo_block_index = i;
+                  break;
+               }
+            }
+         } else {
+            for (unsigned i = 0; i < prog->NumUniformBlocks; i++) {
+               if (strcmp(var->interface_type->name,
+                          prog->UniformBlocks[i].Name) == 0) {
+                  ubo_block_index = i;
+                  break;
+               }
 	    }
 	 }
 	 assert(ubo_block_index != -1);
 
-	 ubo_var_index = var->location;
-	 ubo_var = &block->Uniforms[var->location];
-	 ubo_byte_offset = ubo_var->Offset;
-      }
-
-      process(var);
+         /* Uniform blocks that were specified with an instance name must be
+          * handled a little bit differently.  The name of the variable is the
+          * name used to reference the uniform block instead of being the name
+          * of a variable within the block.  Therefore, searching for the name
+          * within the block will fail.
+          */
+         if (var->is_interface_instance()) {
+            ubo_byte_offset = 0;
+            ubo_row_major = false;
+         } else {
+            const struct gl_uniform_block *const block =
+               &prog->UniformBlocks[ubo_block_index];
+
+            assert(var->location != -1);
+
+            const struct gl_uniform_buffer_variable *const ubo_var =
+               &block->Uniforms[var->location];
+
+            ubo_row_major = ubo_var->RowMajor;
+            ubo_byte_offset = ubo_var->Offset;
+         }
+
+         if (var->is_interface_instance())
+            process(var->interface_type, var->interface_type->name);
+         else
+            process(var);
+      } else
+         process(var);
    }
 
-   struct gl_uniform_buffer_variable *ubo_var;
    int ubo_block_index;
-   int ubo_var_index;
    int ubo_byte_offset;
+   bool ubo_row_major;
 
 private:
-   virtual void visit_field(const glsl_type *type, const char *name)
+   virtual void visit_field(const glsl_type *type, const char *name,
+                            bool row_major)
    {
       assert(!type->is_record());
       assert(!(type->is_array() && type->fields.array->is_record()));
+      assert(!type->is_interface());
+      assert(!(type->is_array() && type->fields.array->is_interface()));
+
+      (void) row_major;
 
       unsigned id;
       bool found = this->map->get(id, name);
@@ -330,17 +416,17 @@ private:
       this->uniforms[id].num_driver_storage = 0;
       this->uniforms[id].driver_storage = NULL;
       this->uniforms[id].storage = this->values;
-      if (this->ubo_var) {
+      if (this->ubo_block_index != -1) {
 	 this->uniforms[id].block_index = this->ubo_block_index;
 
-	 unsigned alignment = type->std140_base_alignment(ubo_var->RowMajor);
-	 this->ubo_byte_offset = align(this->ubo_byte_offset, alignment);
+	 unsigned alignment = type->std140_base_alignment(ubo_row_major);
+	 this->ubo_byte_offset = glsl_align(this->ubo_byte_offset, alignment);
 	 this->uniforms[id].offset = this->ubo_byte_offset;
-	 this->ubo_byte_offset += type->std140_size(ubo_var->RowMajor);
+	 this->ubo_byte_offset += type->std140_size(ubo_row_major);
 
 	 if (type->is_array()) {
 	    this->uniforms[id].array_stride =
-	       align(type->fields.array->std140_size(ubo_var->RowMajor), 16);
+	       glsl_align(type->fields.array->std140_size(ubo_row_major), 16);
 	 } else {
 	    this->uniforms[id].array_stride = 0;
 	 }
@@ -348,7 +434,7 @@ private:
 	 if (type->is_matrix() ||
 	     (type->is_array() && type->fields.array->is_matrix())) {
 	    this->uniforms[id].matrix_stride = 16;
-	    this->uniforms[id].row_major = ubo_var->RowMajor;
+	    this->uniforms[id].row_major = ubo_row_major;
 	 } else {
 	    this->uniforms[id].matrix_stride = 0;
 	    this->uniforms[id].row_major = false;
@@ -399,26 +485,10 @@ link_cross_validate_uniform_block(void *mem_ctx,
 {
    for (unsigned int i = 0; i < *num_linked_blocks; i++) {
       struct gl_uniform_block *old_block = &(*linked_blocks)[i];
-      if (strcmp(old_block->Name, new_block->Name) == 0) {
-	 if (old_block->NumUniforms != new_block->NumUniforms) {
-	    return -1;
-	 }
 
-	 for (unsigned j = 0; j < old_block->NumUniforms; j++) {
-	    if (strcmp(old_block->Uniforms[j].Name,
-		       new_block->Uniforms[j].Name) != 0)
-	       return -1;
-
-	    if (old_block->Uniforms[j].Offset !=
-		new_block->Uniforms[j].Offset)
-	       return -1;
-
-	    if (old_block->Uniforms[j].RowMajor !=
-		new_block->Uniforms[j].RowMajor)
-	       return -1;
-	 }
-	 return i;
-      }
+      if (strcmp(old_block->Name, new_block->Name) == 0)
+	 return link_uniform_blocks_are_compatible(old_block, new_block)
+	    ? i : -1;
    }
 
    *linked_blocks = reralloc(mem_ctx, *linked_blocks,
@@ -440,7 +510,13 @@ link_cross_validate_uniform_block(void *mem_ctx,
       struct gl_uniform_buffer_variable *ubo_var =
 	 &linked_block->Uniforms[i];
 
-      ubo_var->Name = ralloc_strdup(*linked_blocks, ubo_var->Name);
+      if (ubo_var->Name == ubo_var->IndexName) {
+         ubo_var->Name = ralloc_strdup(*linked_blocks, ubo_var->Name);
+         ubo_var->IndexName = ubo_var->Name;
+      } else {
+         ubo_var->Name = ralloc_strdup(*linked_blocks, ubo_var->Name);
+         ubo_var->IndexName = ralloc_strdup(*linked_blocks, ubo_var->IndexName);
+      }
    }
 
    return linked_block_index;
@@ -458,17 +534,47 @@ link_update_uniform_buffer_variables(struct gl_shader *shader)
    foreach_list(node, shader->ir) {
       ir_variable *const var = ((ir_instruction *) node)->as_variable();
 
-      if ((var == NULL) || (var->uniform_block == -1))
+      if ((var == NULL) || !var->is_in_uniform_block())
 	 continue;
 
       assert(var->mode == ir_var_uniform);
 
+      if (var->is_interface_instance()) {
+         var->location = 0;
+         continue;
+      }
+
       bool found = false;
+      char sentinel = '\0';
+
+      if (var->type->is_record()) {
+         sentinel = '.';
+      } else if (var->type->is_array()
+                 && var->type->fields.array->is_record()) {
+         sentinel = '[';
+      }
+
+      const unsigned l = strlen(var->name);
       for (unsigned i = 0; i < shader->NumUniformBlocks; i++) {
 	 for (unsigned j = 0; j < shader->UniformBlocks[i].NumUniforms; j++) {
-	    if (!strcmp(var->name, shader->UniformBlocks[i].Uniforms[j].Name)) {
+            if (sentinel) {
+               const char *begin = shader->UniformBlocks[i].Uniforms[j].Name;
+               const char *end = strchr(begin, sentinel);
+
+               if (end == NULL)
+                  continue;
+
+               if (l != (end - begin))
+                  continue;
+
+               if (strncmp(var->name, begin, l) == 0) {
+                  found = true;
+                  var->location = j;
+                  break;
+               }
+            } else if (!strcmp(var->name,
+                               shader->UniformBlocks[i].Uniforms[j].Name)) {
 	       found = true;
-	       var->uniform_block = i;
 	       var->location = j;
 	       break;
 	    }
@@ -494,7 +600,7 @@ link_assign_uniform_block_offsets(struct gl_shader *shader)
 	 unsigned alignment = type->std140_base_alignment(ubo_var->RowMajor);
 	 unsigned size = type->std140_size(ubo_var->RowMajor);
 
-	 offset = align(offset, alignment);
+	 offset = glsl_align(offset, alignment);
 	 ubo_var->Offset = offset;
 	 offset += size;
       }
@@ -510,7 +616,7 @@ link_assign_uniform_block_offsets(struct gl_shader *shader)
        *      and rounding up to the next multiple of the base
        *      alignment required for a vec4."
        */
-      block->UniformBufferSize = align(offset, 16);
+      block->UniformBufferSize = glsl_align(offset, 16);
    }
 }
 
@@ -538,13 +644,6 @@ link_assign_uniform_locations(struct gl_shader_program *prog)
     */
    memset(prog->SamplerUnits, 0, sizeof(prog->SamplerUnits));
 
-   for (unsigned i = 0; i < MESA_SHADER_TYPES; i++) {
-      if (prog->_LinkedShaders[i] == NULL)
-	 continue;
-
-      link_update_uniform_buffer_variables(prog->_LinkedShaders[i]);
-   }
-
    /* First pass: Count the uniform resources used by the user-defined
     * uniforms.  While this happens, each active uniform will have an index
     * assigned to it.
@@ -557,6 +656,8 @@ link_assign_uniform_locations(struct gl_shader_program *prog)
       if (prog->_LinkedShaders[i] == NULL)
 	 continue;
 
+      link_update_uniform_buffer_variables(prog->_LinkedShaders[i]);
+
       /* Reset various per-shader target counts.
        */
       uniform_size.start_shader();
@@ -620,7 +721,7 @@ link_assign_uniform_locations(struct gl_shader_program *prog)
 	 if (strncmp("gl_", var->name, 3) == 0)
 	    continue;
 
-	 parcel.set_and_process(prog, prog->_LinkedShaders[i], var);
+	 parcel.set_and_process(prog, var);
       }
 
       prog->_LinkedShaders[i]->active_samplers = parcel.shader_samplers_used;
diff --git a/mesalib/src/glsl/link_varyings.cpp b/mesalib/src/glsl/link_varyings.cpp
index 5c27f231e..25681d618 100644
--- a/mesalib/src/glsl/link_varyings.cpp
+++ b/mesalib/src/glsl/link_varyings.cpp
@@ -54,10 +54,7 @@ cross_validate_outputs_to_inputs(struct gl_shader_program *prog,
    foreach_list(node, producer->ir) {
       ir_variable *const var = ((ir_instruction *) node)->as_variable();
 
-      /* FINISHME: For geometry shaders, this should also look for inout
-       * FINISHME: variables.
-       */
-      if ((var == NULL) || (var->mode != ir_var_out))
+      if ((var == NULL) || (var->mode != ir_var_shader_out))
 	 continue;
 
       parameters.add_variable(var);
@@ -71,10 +68,7 @@ cross_validate_outputs_to_inputs(struct gl_shader_program *prog,
    foreach_list(node, consumer->ir) {
       ir_variable *const input = ((ir_instruction *) node)->as_variable();
 
-      /* FINISHME: For geometry shaders, this should also look for inout
-       * FINISHME: variables.
-       */
-      if ((input == NULL) || (input->mode != ir_var_in))
+      if ((input == NULL) || (input->mode != ir_var_shader_in))
 	 continue;
 
       ir_variable *const output = parameters.get_variable(input->name);
@@ -417,8 +411,17 @@ tfeedback_decl::find_output_var(gl_shader_program *prog,
    const char *name = this->is_clip_distance_mesa
       ? "gl_ClipDistanceMESA" : this->var_name;
    ir_variable *var = producer->symbols->get_variable(name);
-   if (var && var->mode == ir_var_out)
+   if (var && var->mode == ir_var_shader_out) {
+      const glsl_type *type = var->type;
+      while (type->base_type == GLSL_TYPE_ARRAY)
+         type = type->fields.array;
+      if (type->base_type == GLSL_TYPE_STRUCT) {
+         linker_error(prog, "Transform feedback of varying structs not "
+                      "implemented yet.");
+         return NULL;
+      }
       return var;
+   }
 
    /* From GL_EXT_transform_feedback:
     *   A program will fail to link if:
@@ -810,16 +813,15 @@ varying_matches::compute_packing_order(ir_variable *var)
 {
    const glsl_type *element_type = var->type;
 
-   /* FINISHME: Support for "varying" records in GLSL 1.50. */
    while (element_type->base_type == GLSL_TYPE_ARRAY) {
       element_type = element_type->fields.array;
    }
 
-   switch (element_type->vector_elements) {
+   switch (element_type->component_slots() % 4) {
    case 1: return PACKING_ORDER_SCALAR;
    case 2: return PACKING_ORDER_VEC2;
    case 3: return PACKING_ORDER_VEC3;
-   case 4: return PACKING_ORDER_VEC4;
+   case 0: return PACKING_ORDER_VEC4;
    default:
       assert(!"Unexpected value of vector_elements");
       return PACKING_ORDER_VEC4;
@@ -854,7 +856,7 @@ is_varying_var(GLenum shaderType, const ir_variable *var)
 {
    /* Only fragment shaders will take a varying variable as an input */
    if (shaderType == GL_FRAGMENT_SHADER &&
-       var->mode == ir_var_in) {
+       var->mode == ir_var_shader_in) {
       switch (var->location) {
       case FRAG_ATTRIB_WPOS:
       case FRAG_ATTRIB_FACE:
@@ -915,13 +917,13 @@ assign_varying_locations(struct gl_context *ctx,
    foreach_list(node, producer->ir) {
       ir_variable *const output_var = ((ir_instruction *) node)->as_variable();
 
-      if ((output_var == NULL) || (output_var->mode != ir_var_out))
+      if ((output_var == NULL) || (output_var->mode != ir_var_shader_out))
 	 continue;
 
       ir_variable *input_var =
 	 consumer ? consumer->symbols->get_variable(output_var->name) : NULL;
 
-      if (input_var && input_var->mode != ir_var_in)
+      if (input_var && input_var->mode != ir_var_shader_in)
          input_var = NULL;
 
       if (input_var) {
@@ -965,11 +967,11 @@ assign_varying_locations(struct gl_context *ctx,
        */
       assert(!ctx->Extensions.EXT_transform_feedback);
    } else {
-      lower_packed_varyings(mem_ctx, producer_base, slots_used, ir_var_out,
-                            producer);
+      lower_packed_varyings(mem_ctx, producer_base, slots_used,
+                            ir_var_shader_out, producer);
       if (consumer) {
-         lower_packed_varyings(mem_ctx, consumer_base, slots_used, ir_var_in,
-                               consumer);
+         lower_packed_varyings(mem_ctx, consumer_base, slots_used,
+                               ir_var_shader_in, consumer);
       }
    }
 
@@ -979,7 +981,7 @@ assign_varying_locations(struct gl_context *ctx,
       foreach_list(node, consumer->ir) {
          ir_variable *const var = ((ir_instruction *) node)->as_variable();
 
-         if ((var == NULL) || (var->mode != ir_var_in))
+         if ((var == NULL) || (var->mode != ir_var_shader_in))
             continue;
 
          if (var->is_unmatched_generic_inout) {
diff --git a/mesalib/src/glsl/linker.cpp b/mesalib/src/glsl/linker.cpp
index 63548e071..63ce178f4 100644
--- a/mesalib/src/glsl/linker.cpp
+++ b/mesalib/src/glsl/linker.cpp
@@ -107,8 +107,8 @@ public:
 	 ir_rvalue *param_rval = (ir_rvalue *)iter.get();
 	 ir_variable *sig_param = (ir_variable *)sig_iter.get();
 
-	 if (sig_param->mode == ir_var_out ||
-	     sig_param->mode == ir_var_inout) {
+	 if (sig_param->mode == ir_var_function_out ||
+	     sig_param->mode == ir_var_function_inout) {
 	    ir_variable *var = param_rval->variable_referenced();
 	    if (var && strcmp(name, var->name) == 0) {
 	       found = true;
@@ -212,10 +212,10 @@ link_invalidate_variable_locations(gl_shader *sh, int input_base,
 
       int base;
       switch (var->mode) {
-      case ir_var_in:
+      case ir_var_shader_in:
          base = input_base;
          break;
-      case ir_var_out:
+      case ir_var_shader_out:
          base = output_base;
          break;
       default:
@@ -393,10 +393,9 @@ mode_string(const ir_variable *var)
    case ir_var_auto:
       return (var->read_only) ? "global constant" : "global variable";
 
-   case ir_var_uniform: return "uniform";
-   case ir_var_in:      return "shader input";
-   case ir_var_out:     return "shader output";
-   case ir_var_inout:   return "shader inout";
+   case ir_var_uniform:    return "uniform";
+   case ir_var_shader_in:  return "shader input";
+   case ir_var_shader_out: return "shader output";
 
    case ir_var_const_in:
    case ir_var_temporary:
@@ -874,7 +873,6 @@ link_intrastage_shaders(void *mem_ctx,
 			unsigned num_shaders)
 {
    struct gl_uniform_block *uniform_blocks = NULL;
-   unsigned num_uniform_blocks = 0;
 
    /* Check that global variables defined in multiple shaders are consistent.
     */
@@ -882,23 +880,11 @@ link_intrastage_shaders(void *mem_ctx,
       return NULL;
 
    /* Check that uniform blocks between shaders for a stage agree. */
-   for (unsigned i = 0; i < num_shaders; i++) {
-      struct gl_shader *sh = shader_list[i];
-
-      for (unsigned j = 0; j < sh->NumUniformBlocks; j++) {
-	 link_assign_uniform_block_offsets(sh);
-
-	 int index = link_cross_validate_uniform_block(mem_ctx,
-						       &uniform_blocks,
-						       &num_uniform_blocks,
-						       &sh->UniformBlocks[j]);
-	 if (index == -1) {
-	    linker_error(prog, "uniform block `%s' has mismatching definitions",
-			 sh->UniformBlocks[j].Name);
-	    return NULL;
-	 }
-      }
-   }
+   const int num_uniform_blocks =
+      link_uniform_blocks(mem_ctx, prog, shader_list, num_shaders,
+                          &uniform_blocks);
+   if (num_uniform_blocks < 0)
+      return NULL;
 
    /* Check that there is only a single definition of each function signature
     * across all shaders.
@@ -1069,8 +1055,8 @@ update_array_sizes(struct gl_shader_program *prog)
 	 ir_variable *const var = ((ir_instruction *) node)->as_variable();
 
 	 if ((var == NULL) || (var->mode != ir_var_uniform &&
-			       var->mode != ir_var_in &&
-			       var->mode != ir_var_out) ||
+			       var->mode != ir_var_shader_in &&
+			       var->mode != ir_var_shader_out) ||
 	     !var->type->is_array())
 	    continue;
 
@@ -1078,7 +1064,7 @@ update_array_sizes(struct gl_shader_program *prog)
 	  * will not be eliminated.  Since we always do std140, just
 	  * don't resize arrays in UBOs.
 	  */
-	 if (var->uniform_block != -1)
+	 if (var->is_in_uniform_block())
 	    continue;
 
 	 unsigned int size = var->max_array_access;
@@ -1206,7 +1192,8 @@ assign_attribute_or_color_locations(gl_shader_program *prog,
       ? (int) VERT_ATTRIB_GENERIC0 : (int) FRAG_RESULT_DATA0;
 
    const enum ir_variable_mode direction =
-      (target_index == MESA_SHADER_VERTEX) ? ir_var_in : ir_var_out;
+      (target_index == MESA_SHADER_VERTEX)
+      ? ir_var_shader_in : ir_var_shader_out;
 
 
    /* Temporary storage for the set of attributes that need locations assigned.
@@ -1428,7 +1415,7 @@ store_fragdepth_layout(struct gl_shader_program *prog)
    foreach_list(node, ir) {
       ir_variable *const var = ((ir_instruction *) node)->as_variable();
 
-      if (var == NULL || var->mode != ir_var_out) {
+      if (var == NULL || var->mode != ir_var_shader_out) {
          continue;
       }
 
@@ -1809,7 +1796,7 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
 
    if (prog->_LinkedShaders[MESA_SHADER_VERTEX] != NULL) {
       demote_shader_inputs_and_outputs(prog->_LinkedShaders[MESA_SHADER_VERTEX],
-				       ir_var_out);
+				       ir_var_shader_out);
 
       /* Eliminate code that is now dead due to unused vertex outputs being
        * demoted.
@@ -1821,9 +1808,8 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
    if (prog->_LinkedShaders[MESA_SHADER_GEOMETRY] != NULL) {
       gl_shader *const sh = prog->_LinkedShaders[MESA_SHADER_GEOMETRY];
 
-      demote_shader_inputs_and_outputs(sh, ir_var_in);
-      demote_shader_inputs_and_outputs(sh, ir_var_inout);
-      demote_shader_inputs_and_outputs(sh, ir_var_out);
+      demote_shader_inputs_and_outputs(sh, ir_var_shader_in);
+      demote_shader_inputs_and_outputs(sh, ir_var_shader_out);
 
       /* Eliminate code that is now dead due to unused geometry outputs being
        * demoted.
@@ -1835,7 +1821,7 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
    if (prog->_LinkedShaders[MESA_SHADER_FRAGMENT] != NULL) {
       gl_shader *const sh = prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
 
-      demote_shader_inputs_and_outputs(sh, ir_var_in);
+      demote_shader_inputs_and_outputs(sh, ir_var_shader_in);
 
       /* Eliminate code that is now dead due to unused fragment inputs being
        * demoted.  This shouldn't actually do anything other than remove
diff --git a/mesalib/src/glsl/linker.h b/mesalib/src/glsl/linker.h
index 67c7f3488..14eb9c1cd 100644
--- a/mesalib/src/glsl/linker.h
+++ b/mesalib/src/glsl/linker.h
@@ -49,6 +49,17 @@ link_cross_validate_uniform_block(void *mem_ctx,
 void
 link_assign_uniform_block_offsets(struct gl_shader *shader);
 
+extern bool
+link_uniform_blocks_are_compatible(const gl_uniform_block *a,
+				   const gl_uniform_block *b);
+
+extern int
+link_uniform_blocks(void *mem_ctx,
+                    struct gl_shader_program *prog,
+                    struct gl_shader **shader_list,
+                    unsigned num_shaders,
+                    struct gl_uniform_block **blocks_ret);
+
 /**
  * Class for processing all of the leaf fields of an uniform
  *
@@ -71,24 +82,60 @@ public:
     * \param var  The uniform variable that is to be processed
     *
     * Calls \c ::visit_field for each leaf of the uniform.
+    *
+    * \warning
+    * This entry should only be used with uniform blocks in cases where the
+    * row / column ordering of matrices in the block does not matter.  For
+    * example, enumerating the names of members of the block, but not for
+    * determining the offsets of members.
     */
    void process(ir_variable *var);
 
+   /**
+    * Begin processing a uniform of a structured type.
+    *
+    * This flavor of \c process should be used to handle structured types
+    * (i.e., structures, interfaces, or arrays thereof) that need special
+    * name handling.  A common usage is to handle cases where the block name
+    * (instead of the instance name) is used for an interface block.
+    *
+    * \param type  Type that is to be processed, associated with \c name
+    * \param name  Base name of the structured uniform being processed
+    *
+    * \note
+    * \c type must be \c GLSL_TYPE_RECORD, \c GLSL_TYPE_INTERFACE, or an array
+    * thereof.
+    */
+   void process(const glsl_type *type, const char *name);
+
 protected:
    /**
     * Method invoked for each leaf of the uniform
     *
     * \param type  Type of the field.
     * \param name  Fully qualified name of the field.
+    * \param row_major  For a matrix type, is it stored row-major.
+    */
+   virtual void visit_field(const glsl_type *type, const char *name,
+                            bool row_major) = 0;
+
+   /**
+    * Visit a record before visiting its fields
+    *
+    * For structures-of-structures or interfaces-of-structures, this visits
+    * the inner structure before visiting its fields.
+    *
+    * The default implementation does nothing.
     */
-   virtual void visit_field(const glsl_type *type, const char *name) = 0;
+   virtual void visit_field(const glsl_struct_field *field);
 
 private:
    /**
     * \param name_length  Length of the current name \b not including the
     *                     terminating \c NUL character.
     */
-   void recursion(const glsl_type *t, char **name, size_t name_length);
+   void recursion(const glsl_type *t, char **name, size_t name_length,
+                  bool row_major);
 };
 
 void
diff --git a/mesalib/src/glsl/lower_clip_distance.cpp b/mesalib/src/glsl/lower_clip_distance.cpp
index 09bdc36e1..643807de8 100644
--- a/mesalib/src/glsl/lower_clip_distance.cpp
+++ b/mesalib/src/glsl/lower_clip_distance.cpp
@@ -301,8 +301,8 @@ lower_clip_distance_visitor::visit_leave(ir_call *ir)
          this->base_ir->insert_before(temp_clip_distance);
          actual_param->replace_with(
             new(ctx) ir_dereference_variable(temp_clip_distance));
-         if (formal_param->mode == ir_var_in
-             || formal_param->mode == ir_var_inout) {
+         if (formal_param->mode == ir_var_function_in
+             || formal_param->mode == ir_var_function_inout) {
             /* Copy from gl_ClipDistance to the temporary before the call.
              * Since we are going to insert this copy before the current
              * instruction, we need to visit it afterwards to make sure it
@@ -314,8 +314,8 @@ lower_clip_distance_visitor::visit_leave(ir_call *ir)
             this->base_ir->insert_before(new_assignment);
             this->visit_new_assignment(new_assignment);
          }
-         if (formal_param->mode == ir_var_out
-             || formal_param->mode == ir_var_inout) {
+         if (formal_param->mode == ir_var_function_out
+             || formal_param->mode == ir_var_function_inout) {
             /* Copy from the temporary to gl_ClipDistance after the call.
              * Since visit_list_elements() has already decided which
              * instruction it's going to visit next, we need to visit
diff --git a/mesalib/src/glsl/lower_output_reads.cpp b/mesalib/src/glsl/lower_output_reads.cpp
index a6192a517..b93e254ec 100644
--- a/mesalib/src/glsl/lower_output_reads.cpp
+++ b/mesalib/src/glsl/lower_output_reads.cpp
@@ -41,7 +41,7 @@ class output_read_remover : public ir_hierarchical_visitor {
 protected:
    /**
     * A hash table mapping from the original ir_variable shader outputs
-    * (ir_var_out mode) to the new temporaries to be used instead.
+    * (ir_var_shader_out mode) to the new temporaries to be used instead.
     */
    hash_table *replacements;
 
@@ -86,7 +86,7 @@ output_read_remover::~output_read_remover()
 ir_visitor_status
 output_read_remover::visit(ir_dereference_variable *ir)
 {
-   if (ir->var->mode != ir_var_out)
+   if (ir->var->mode != ir_var_shader_out)
       return visit_continue;
 
    ir_variable *temp = (ir_variable *) hash_table_find(replacements, ir->var);
diff --git a/mesalib/src/glsl/lower_packed_varyings.cpp b/mesalib/src/glsl/lower_packed_varyings.cpp
index 9e7f274b7..8a40f5e72 100644
--- a/mesalib/src/glsl/lower_packed_varyings.cpp
+++ b/mesalib/src/glsl/lower_packed_varyings.cpp
@@ -70,6 +70,10 @@
  * This lowering pass also packs flat floats, ints, and uints together, by
  * using ivec4 as the base type of flat "varyings", and using appropriate
  * casts to convert floats and uints into ints.
+ *
+ * This lowering pass also handles varyings whose type is a struct or an array
+ * of struct.  Structs are packed in order and with no gaps, so there may be a
+ * performance penalty due to structure elements being double-parked.
  */
 
 #include "glsl_symbol_table.h"
@@ -135,8 +139,8 @@ private:
    ir_variable **packed_varyings;
 
    /**
-    * Type of varying which is being lowered in this pass (either ir_var_in or
-    * ir_var_out).
+    * Type of varying which is being lowered in this pass (either
+    * ir_var_shader_in or ir_var_shader_out).
     */
    const ir_variable_mode mode;
 
@@ -274,10 +278,20 @@ lower_packed_varyings_visitor::lower_rvalue(ir_rvalue *rvalue,
                                             ir_variable *unpacked_var,
                                             const char *name)
 {
-   /* FINISHME: Support for "varying" records in GLSL 1.50. */
-   assert(!rvalue->type->is_record());
-
-   if (rvalue->type->is_array()) {
+   if (rvalue->type->is_record()) {
+      for (unsigned i = 0; i < rvalue->type->length; i++) {
+         if (i != 0)
+            rvalue = rvalue->clone(this->mem_ctx, NULL);
+         const char *field_name = rvalue->type->fields.structure[i].name;
+         ir_dereference_record *dereference_record = new(this->mem_ctx)
+            ir_dereference_record(rvalue, field_name);
+         char *deref_name
+            = ralloc_asprintf(this->mem_ctx, "%s.%s", name, field_name);
+         fine_location = this->lower_rvalue(dereference_record, fine_location,
+                                            unpacked_var, deref_name);
+      }
+      return fine_location;
+   } else if (rvalue->type->is_array()) {
       /* Arrays are packed/unpacked by considering each array element in
        * sequence.
        */
@@ -336,7 +350,7 @@ lower_packed_varyings_visitor::lower_rvalue(ir_rvalue *rvalue,
                                                           unpacked_var, name));
       ir_swizzle *swizzle = new(this->mem_ctx)
          ir_swizzle(packed_deref, swizzle_values, components);
-      if (this->mode == ir_var_out) {
+      if (this->mode == ir_var_shader_out) {
          ir_assignment *assignment
             = this->bitwise_assign_pack(swizzle, rvalue);
          this->main_instructions->push_tail(assignment);
diff --git a/mesalib/src/glsl/lower_packing_builtins.cpp b/mesalib/src/glsl/lower_packing_builtins.cpp
new file mode 100644
index 000000000..db73c7b0f
--- /dev/null
+++ b/mesalib/src/glsl/lower_packing_builtins.cpp
@@ -0,0 +1,1314 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include "ir.h"
+#include "ir_builder.h"
+#include "ir_optimization.h"
+#include "ir_rvalue_visitor.h"
+
+namespace {
+
+using namespace ir_builder;
+
+/**
+ * A visitor that lowers built-in floating-point pack/unpack expressions
+ * such as packSnorm2x16.
+ */
+class lower_packing_builtins_visitor : public ir_rvalue_visitor {
+public:
+   /**
+    * \param op_mask bitmask of `enum lower_packing_builtins_op` flags to lower
+    */
+   explicit lower_packing_builtins_visitor(int op_mask)
+      : op_mask(op_mask),
+        progress(false)
+   {
+      /* The HALF_2x16 and HALF_2x16_TO_SPLIT options are mutually exclusive. */
+      assert(!((op_mask & LOWER_PACK_HALF_2x16) &&
+               (op_mask & LOWER_PACK_HALF_2x16_TO_SPLIT)));
+
+      assert(!((op_mask & LOWER_UNPACK_HALF_2x16) &&
+               (op_mask & LOWER_UNPACK_HALF_2x16_TO_SPLIT)));
+
+      /* Builder output is staged in factory_instructions until flushed. */
+      factory.instructions = &factory_instructions;
+   }
+
+   virtual ~lower_packing_builtins_visitor()
+   {
+      assert(factory_instructions.is_empty()); /* all staged IR was flushed */
+   }
+
+   bool get_progress() { return progress; } /* true if anything was lowered */
+   /** Replace \c *rvalue with lowered IR when op_mask selects its operation. */
+   void handle_rvalue(ir_rvalue **rvalue)
+   {
+      if (!*rvalue)
+	 return;
+
+      ir_expression *expr = (*rvalue)->as_expression();
+      if (!expr)
+	 return;
+
+      enum lower_packing_builtins_op lowering_op =
+         choose_lowering_op(expr->operation);
+
+      if (lowering_op == LOWER_PACK_UNPACK_NONE)
+         return;
+
+      setup_factory(ralloc_parent(expr));
+      /* Reparent the operand onto the factory's context for reuse below. */
+      ir_rvalue *op0 = expr->operands[0];
+      ralloc_steal(factory.mem_ctx, op0);
+
+      switch (lowering_op) {
+      case LOWER_PACK_SNORM_2x16:
+         *rvalue = lower_pack_snorm_2x16(op0);
+         break;
+      case LOWER_PACK_SNORM_4x8:
+         *rvalue = lower_pack_snorm_4x8(op0);
+         break;
+      case LOWER_PACK_UNORM_2x16:
+         *rvalue = lower_pack_unorm_2x16(op0);
+         break;
+      case LOWER_PACK_UNORM_4x8:
+         *rvalue = lower_pack_unorm_4x8(op0);
+         break;
+      case LOWER_PACK_HALF_2x16:
+         *rvalue = lower_pack_half_2x16(op0);
+         break;
+      case LOWER_PACK_HALF_2x16_TO_SPLIT:
+         *rvalue = split_pack_half_2x16(op0);
+         break;
+      case LOWER_UNPACK_SNORM_2x16:
+         *rvalue = lower_unpack_snorm_2x16(op0);
+         break;
+      case LOWER_UNPACK_SNORM_4x8:
+         *rvalue = lower_unpack_snorm_4x8(op0);
+         break;
+      case LOWER_UNPACK_UNORM_2x16:
+         *rvalue = lower_unpack_unorm_2x16(op0);
+         break;
+      case LOWER_UNPACK_UNORM_4x8:
+         *rvalue = lower_unpack_unorm_4x8(op0);
+         break;
+      case LOWER_UNPACK_HALF_2x16:
+         *rvalue = lower_unpack_half_2x16(op0);
+         break;
+      case LOWER_UNPACK_HALF_2x16_TO_SPLIT:
+         *rvalue = split_unpack_half_2x16(op0);
+         break;
+      case LOWER_PACK_UNPACK_NONE:
+         assert(!"not reached");
+         break;
+      }
+      /* Flush the staged instructions ahead of base_ir. */
+      teardown_factory();
+      progress = true;
+   }
+
+private:
+   const int op_mask;
+   bool progress;
+   ir_factory factory;
+   exec_list factory_instructions;
+
+   /**
+    * Determine the needed lowering operation by filtering \a expr_op
+    * through \ref op_mask; unrelated operations give LOWER_PACK_UNPACK_NONE.
+    */
+   enum lower_packing_builtins_op
+   choose_lowering_op(ir_expression_operation expr_op)
+   {
+      /* C++ regards int and enum as fundamentally different types.
+       * So, we can't simply return from each case; we must cast the return
+       * value.
+       */
+      int result;
+
+      switch (expr_op) {
+      case ir_unop_pack_snorm_2x16:
+         result = op_mask & LOWER_PACK_SNORM_2x16;
+         break;
+      case ir_unop_pack_snorm_4x8:
+         result = op_mask & LOWER_PACK_SNORM_4x8;
+         break;
+      case ir_unop_pack_unorm_2x16:
+         result = op_mask & LOWER_PACK_UNORM_2x16;
+         break;
+      case ir_unop_pack_unorm_4x8:
+         result = op_mask & LOWER_PACK_UNORM_4x8;
+         break;
+      case ir_unop_pack_half_2x16:
+         result = op_mask & (LOWER_PACK_HALF_2x16 | LOWER_PACK_HALF_2x16_TO_SPLIT);
+         break;
+      case ir_unop_unpack_snorm_2x16:
+         result = op_mask & LOWER_UNPACK_SNORM_2x16;
+         break;
+      case ir_unop_unpack_snorm_4x8:
+         result = op_mask & LOWER_UNPACK_SNORM_4x8;
+         break;
+      case ir_unop_unpack_unorm_2x16:
+         result = op_mask & LOWER_UNPACK_UNORM_2x16;
+         break;
+      case ir_unop_unpack_unorm_4x8:
+         result = op_mask & LOWER_UNPACK_UNORM_4x8;
+         break;
+      case ir_unop_unpack_half_2x16:
+         result = op_mask & (LOWER_UNPACK_HALF_2x16 | LOWER_UNPACK_HALF_2x16_TO_SPLIT);
+         break;
+      default:
+         result = LOWER_PACK_UNPACK_NONE;
+         break;
+      }
+
+      return static_cast<enum lower_packing_builtins_op>(result);
+   }
+   /** Point the factory at \c mem_ctx; it must currently be idle and empty. */
+   void
+   setup_factory(void *mem_ctx)
+   {
+      assert(factory.mem_ctx == NULL);
+      assert(factory.instructions->is_empty());
+
+      factory.mem_ctx = mem_ctx;
+   }
+   /** Move the staged instructions before \c base_ir and idle the factory. */
+   void
+   teardown_factory()
+   {
+      base_ir->insert_before(factory.instructions);
+      assert(factory.instructions->is_empty());
+      factory.mem_ctx = NULL;
+   }
+   /** Shorthand for \c factory.constant(x). */
+   template <typename T>
+   ir_constant*
+   constant(T x)
+   {
+      return factory.constant(x);
+   }
+
+   /**
+    * \brief Pack two uint16's into a single uint32.
+    *
+    * Interpret the given uvec2 as a uint16 pair. Pack the pair into a uint32
+    * where the least significant bits specify the first element of the pair.
+    * Return the uint32; only the low 16 bits of each element are kept.
+    */
+   ir_rvalue*
+   pack_uvec2_to_uint(ir_rvalue *uvec2_rval)
+   {
+      assert(uvec2_rval->type == glsl_type::uvec2_type);
+
+      /* uvec2 u = UVEC2_RVAL; */
+      ir_variable *u = factory.make_temp(glsl_type::uvec2_type,
+                                          "tmp_pack_uvec2_to_uint");
+      factory.emit(assign(u, uvec2_rval));
+
+      /* return (u.y << 16) | (u.x & 0xffff); */
+      return bit_or(lshift(swizzle_y(u), constant(16u)),
+                    bit_and(swizzle_x(u), constant(0xffffu)));
+   }
+
+   /**
+    * \brief Pack four uint8's into a single uint32.
+    *
+    * Interpret the given uvec4 as a uint8 4-tuple. Pack the 4-tuple into a
+    * uint32 where the least significant bits specify the first element of the
+    * 4-tuple. Return the uint32.
+    */
+   ir_rvalue*
+   pack_uvec4_to_uint(ir_rvalue *uvec4_rval)
+   {
+      assert(uvec4_rval->type == glsl_type::uvec4_type);
+
+      /* uvec4 u = UVEC4_RVAL & 0xffu; */
+      ir_variable *u = factory.make_temp(glsl_type::uvec4_type,
+                                          "tmp_pack_uvec4_to_uint");
+      factory.emit(assign(u, bit_and(uvec4_rval, constant(0xffu))));
+
+      /* return (u.w << 24) | (u.z << 16) | (u.y << 8) | u.x; */
+      return bit_or(bit_or(lshift(swizzle_w(u), constant(24u)),
+                           lshift(swizzle_z(u), constant(16u))),
+                    bit_or(lshift(swizzle_y(u), constant(8u)),
+                           swizzle_x(u)));
+   }
+
+   /**
+    * \brief Unpack a uint32 into two uint16's.
+    *
+    * Interpret the given uint32 as a uint16 pair where the uint32's least
+    * significant bits specify the pair's first element. Return the uint16
+    * pair as a uvec2 (x = low 16 bits, y = high 16 bits).
+    */
+   ir_rvalue*
+   unpack_uint_to_uvec2(ir_rvalue *uint_rval)
+   {
+      assert(uint_rval->type == glsl_type::uint_type);
+
+      /* uint u = UINT_RVAL; */
+      ir_variable *u = factory.make_temp(glsl_type::uint_type,
+                                          "tmp_unpack_uint_to_uvec2_u");
+      factory.emit(assign(u, uint_rval));
+
+      /* uvec2 u2; */
+      ir_variable *u2 = factory.make_temp(glsl_type::uvec2_type,
+                                           "tmp_unpack_uint_to_uvec2_u2");
+
+      /* u2.x = u & 0xffffu; */
+      factory.emit(assign(u2, bit_and(u, constant(0xffffu)), WRITEMASK_X));
+
+      /* u2.y = u >> 16u; */
+      factory.emit(assign(u2, rshift(u, constant(16u)), WRITEMASK_Y));
+
+      return deref(u2).val;
+   }
+
+   /**
+    * \brief Unpack a uint32 into four uint8's.
+    *
+    * Interpret the given uint32 as a uint8 4-tuple where the uint32's least
+    * significant bits specify the 4-tuple's first element. Return the uint8
+    * 4-tuple as a uvec4.
+    */
+   ir_rvalue*
+   unpack_uint_to_uvec4(ir_rvalue *uint_rval)
+   {
+      assert(uint_rval->type == glsl_type::uint_type);
+
+      /* uint u = UINT_RVAL; */
+      ir_variable *u = factory.make_temp(glsl_type::uint_type,
+                                          "tmp_unpack_uint_to_uvec4_u");
+      factory.emit(assign(u, uint_rval));
+
+      /* uvec4 u4; */
+      ir_variable *u4 = factory.make_temp(glsl_type::uvec4_type,
+                                           "tmp_unpack_uint_to_uvec4_u4");
+
+      /* u4.x = u & 0xffu; */
+      factory.emit(assign(u4, bit_and(u, constant(0xffu)), WRITEMASK_X));
+
+      /* u4.y = (u >> 8u) & 0xffu; */
+      factory.emit(assign(u4, bit_and(rshift(u, constant(8u)),
+                                      constant(0xffu)), WRITEMASK_Y));
+
+      /* u4.z = (u >> 16u) & 0xffu; */
+      factory.emit(assign(u4, bit_and(rshift(u, constant(16u)),
+                                      constant(0xffu)), WRITEMASK_Z));
+
+      /* u4.w = u >> 24u; */
+      factory.emit(assign(u4, rshift(u, constant(24u)), WRITEMASK_W));
+
+      return deref(u4).val;
+   }
+
+   /**
+    * \brief Lower a packSnorm2x16 expression.
+    *
+    * \param vec2_rval is packSnorm2x16's input
+    * \return packSnorm2x16's output as a uint rvalue
+    */
+   ir_rvalue*
+   lower_pack_snorm_2x16(ir_rvalue *vec2_rval)
+   {
+      /* From page 88 (94 of pdf) of the GLSL ES 3.00 spec:
+       *
+       *    highp uint packSnorm2x16(vec2 v)
+       *    --------------------------------
+       *    First, converts each component of the normalized floating-point value
+       *    v into 16-bit integer values. Then, the results are packed into the
+       *    returned 32-bit unsigned integer.
+       *
+       *    The conversion for component c of v to fixed point is done as
+       *    follows:
+       *
+       *       packSnorm2x16: round(clamp(c, -1, +1) * 32767.0)
+       *
+       *    The first component of the vector will be written to the least
+       *    significant bits of the output; the last component will be written to
+       *    the most significant bits.
+       *
+       * This function generates IR that approximates the following pseudo-GLSL:
+       *
+       *     return pack_uvec2_to_uint(
+       *         uvec2(ivec2(
+       *           roundEven(clamp(VEC2_RVALUE, -1.0f, 1.0f) * 32767.0f))));
+       *
+       * It is necessary to first convert the vec2 to ivec2 rather than directly
+       * converting vec2 to uvec2 because the latter conversion is undefined.
+       * From page 56 (62 of pdf) of the GLSL ES 3.00 spec: "It is undefined to
+       * convert a negative floating point value to an uint".
+       */
+      assert(vec2_rval->type == glsl_type::vec2_type);
+
+      ir_rvalue *result = pack_uvec2_to_uint(
+            i2u(f2i(round_even(mul(clamp(vec2_rval,
+                                         constant(-1.0f),
+                                         constant(1.0f)),
+                                   constant(32767.0f))))));
+
+      assert(result->type == glsl_type::uint_type);
+      return result;
+   }
+
+   /**
+    * \brief Lower a packSnorm4x8 expression.
+    *
+    * \param vec4_rval is packSnorm4x8's input
+    * \return packSnorm4x8's output as a uint rvalue
+    */
+   ir_rvalue*
+   lower_pack_snorm_4x8(ir_rvalue *vec4_rval)
+   {
+      /* From page 137 (143 of pdf) of the GLSL 4.30 spec:
+       *
+       *    highp uint packSnorm4x8(vec4 v)
+       *    -------------------------------
+       *    First, converts each component of the normalized floating-point value
+       *    v into 8-bit integer values. Then, the results are packed into the
+       *    returned 32-bit unsigned integer.
+       *
+       *    The conversion for component c of v to fixed point is done as
+       *    follows:
+       *
+       *       packSnorm4x8: round(clamp(c, -1, +1) * 127.0)
+       *
+       *    The first component of the vector will be written to the least
+       *    significant bits of the output; the last component will be written to
+       *    the most significant bits.
+       *
+       * This function generates IR that approximates the following pseudo-GLSL:
+       *
+       *     return pack_uvec4_to_uint(
+       *         uvec4(ivec4(
+       *           roundEven(clamp(VEC4_RVALUE, -1.0f, 1.0f) * 127.0f))));
+       *
+       * It is necessary to first convert the vec4 to ivec4 rather than directly
+       * converting vec4 to uvec4 because the latter conversion is undefined.
+       * From page 87 (93 of pdf) of the GLSL 4.30 spec: "It is undefined to
+       * convert a negative floating point value to an uint".
+       */
+      assert(vec4_rval->type == glsl_type::vec4_type);
+
+      ir_rvalue *result = pack_uvec4_to_uint(
+            i2u(f2i(round_even(mul(clamp(vec4_rval,
+                                         constant(-1.0f),
+                                         constant(1.0f)),
+                                   constant(127.0f))))));
+
+      assert(result->type == glsl_type::uint_type);
+      return result;
+   }
+
+   /**
+    * \brief Lower an unpackSnorm2x16 expression.
+    *
+    * \param uint_rval is unpackSnorm2x16's input
+    * \return unpackSnorm2x16's output as a vec2 rvalue
+    */
+   ir_rvalue*
+   lower_unpack_snorm_2x16(ir_rvalue *uint_rval)
+   {
+      /* From page 88 (94 of pdf) of the GLSL ES 3.00 spec:
+       *
+       *    highp vec2 unpackSnorm2x16 (highp uint p)
+       *    -----------------------------------------
+       *    First, unpacks a single 32-bit unsigned integer p into a pair of
+       *    16-bit unsigned integers. Then, each component is converted to
+       *    a normalized floating-point value to generate the returned
+       *    two-component vector.
+       *
+       *    The conversion for unpacked fixed-point value f to floating point is
+       *    done as follows:
+       *
+       *       unpackSnorm2x16: clamp(f / 32767.0, -1,+1)
+       *
+       *    The first component of the returned vector will be extracted from the
+       *    least significant bits of the input; the last component will be
+       *    extracted from the most significant bits.
+       *
+       * This function generates IR that approximates the following pseudo-GLSL:
+       *
+       *    return clamp(
+       *       ((ivec2(unpack_uint_to_uvec2(UINT_RVALUE)) << 16) >> 16) / 32767.0f,
+       *       -1.0f, 1.0f);
+       *
+       * The above IR may appear unnecessarily complex, but the intermediate
+       * conversion to ivec2 and the bit shifts are necessary to correctly unpack
+       * negative floats.
+       *
+       * To see why, consider packing and then unpacking vec2(-1.0, 0.0).
+       * packSnorm2x16 encodes -1.0 as the int16 0x8001. During unpacking, we
+       * place that int16 into an int32, which results in the *positive* integer
+       * 0x00008001.  The int16's sign bit becomes, in the int32, the rather
+       * unimportant bit 16. We must now extend the int16's sign bit into bits
+       * 17-32, which is accomplished by left-shifting then right-shifting.
+       */
+
+      assert(uint_rval->type == glsl_type::uint_type);
+
+      ir_rvalue *result =
+        clamp(div(i2f(rshift(lshift(u2i(unpack_uint_to_uvec2(uint_rval)),
+                                    constant(16)),
+                             constant(16u))),
+                  constant(32767.0f)),
+              constant(-1.0f),
+              constant(1.0f));
+
+      assert(result->type == glsl_type::vec2_type);
+      return result;
+   }
+
+   /**
+    * \brief Lower an unpackSnorm4x8 expression.
+    *
+    * \param uint_rval is unpackSnorm4x8's input
+    * \return unpackSnorm4x8's output as a vec4 rvalue
+    */
+   ir_rvalue*
+   lower_unpack_snorm_4x8(ir_rvalue *uint_rval)
+   {
+      /* From page 137 (143 of pdf) of the GLSL 4.30 spec:
+       *
+       *    highp vec4 unpackSnorm4x8 (highp uint p)
+       *    ----------------------------------------
+       *    First, unpacks a single 32-bit unsigned integer p into four
+       *    8-bit unsigned integers. Then, each component is converted to
+       *    a normalized floating-point value to generate the returned
+       *    four-component vector.
+       *
+       *    The conversion for unpacked fixed-point value f to floating point is
+       *    done as follows:
+       *
+       *       unpackSnorm4x8: clamp(f / 127.0, -1, +1)
+       *
+       *    The first component of the returned vector will be extracted from the
+       *    least significant bits of the input; the last component will be
+       *    extracted from the most significant bits.
+       *
+       * This function generates IR that approximates the following pseudo-GLSL:
+       *
+       *    return clamp(
+       *       ((ivec4(unpack_uint_to_uvec4(UINT_RVALUE)) << 24) >> 24) / 127.0f,
+       *       -1.0f, 1.0f);
+       *
+       * The above IR may appear unnecessarily complex, but the intermediate
+       * conversion to ivec4 and the bit shifts are necessary to correctly unpack
+       * negative floats.
+       *
+       * To see why, consider packing and then unpacking vec4(-1.0, 0.0, 0.0,
+       * 0.0). packSnorm4x8 encodes -1.0 as the int8 0x81. During unpacking, we
+       * place that int8 into an int32, which results in the *positive* integer
+       * 0x00000081.  The int8's sign bit becomes, in the int32, the rather
+       * unimportant bit 8. We must now extend the int8's sign bit into bits
+       * 9-32, which is accomplished by left-shifting then right-shifting.
+       */
+
+      assert(uint_rval->type == glsl_type::uint_type);
+
+      ir_rvalue *result =
+        clamp(div(i2f(rshift(lshift(u2i(unpack_uint_to_uvec4(uint_rval)),
+                                    constant(24u)),
+                             constant(24u))),
+                  constant(127.0f)),
+              constant(-1.0f),
+              constant(1.0f));
+
+      assert(result->type == glsl_type::vec4_type);
+      return result;
+   }
+
+   /**
+    * \brief Lower a packUnorm2x16 expression.
+    *
+    * \param vec2_rval is packUnorm2x16's input
+    * \return packUnorm2x16's output as a uint rvalue
+    */
+   ir_rvalue*
+   lower_pack_unorm_2x16(ir_rvalue *vec2_rval)
+   {
+      /* From page 88 (94 of pdf) of the GLSL ES 3.00 spec:
+       *
+       *    highp uint packUnorm2x16 (vec2 v)
+       *    ---------------------------------
+       *    First, converts each component of the normalized floating-point value
+       *    v into 16-bit integer values. Then, the results are packed into the
+       *    returned 32-bit unsigned integer.
+       *
+       *    The conversion for component c of v to fixed point is done as
+       *    follows:
+       *
+       *       packUnorm2x16: round(clamp(c, 0, +1) * 65535.0)
+       *
+       *    The first component of the vector will be written to the least
+       *    significant bits of the output; the last component will be written to
+       *    the most significant bits.
+       *
+       * This function generates IR that approximates the following pseudo-GLSL:
+       *
+       *     return pack_uvec2_to_uint(uvec2(
+       *                round(clamp(VEC2_RVALUE, 0.0f, 1.0f) * 65535.0f)));
+       *
+       * Here it is safe to directly convert the vec2 to uvec2 because the
+       * vec2 has been clamped to a non-negative range.
+       */
+
+      assert(vec2_rval->type == glsl_type::vec2_type);
+
+      ir_rvalue *result = pack_uvec2_to_uint(
+         f2u(round_even(mul(saturate(vec2_rval), constant(65535.0f)))));
+
+      assert(result->type == glsl_type::uint_type);
+      return result;
+   }
+
+   /**
+    * \brief Lower a packUnorm4x8 expression.
+    *
+    * \param vec4_rval is packUnorm4x8's input
+    * \return packUnorm4x8's output as a uint rvalue
+    */
+   ir_rvalue*
+   lower_pack_unorm_4x8(ir_rvalue *vec4_rval)
+   {
+      /* From page 137 (143 of pdf) of the GLSL 4.30 spec:
+       *
+       *    highp uint packUnorm4x8 (vec4 v)
+       *    --------------------------------
+       *    First, converts each component of the normalized floating-point value
+       *    v into 8-bit integer values. Then, the results are packed into the
+       *    returned 32-bit unsigned integer.
+       *
+       *    The conversion for component c of v to fixed point is done as
+       *    follows:
+       *
+       *       packUnorm4x8: round(clamp(c, 0, +1) * 255.0)
+       *
+       *    The first component of the vector will be written to the least
+       *    significant bits of the output; the last component will be written to
+       *    the most significant bits.
+       *
+       * This function generates IR that approximates the following pseudo-GLSL:
+       *
+       *     return pack_uvec4_to_uint(uvec4(
+       *                round(clamp(VEC4_RVALUE, 0.0f, 1.0f) * 255.0f)));
+       *
+       * Here it is safe to directly convert the vec4 to uvec4 because the
+       * vec4 has been clamped to a non-negative range.
+       */
+
+      assert(vec4_rval->type == glsl_type::vec4_type);
+
+      ir_rvalue *result = pack_uvec4_to_uint(
+         f2u(round_even(mul(saturate(vec4_rval), constant(255.0f)))));
+
+      assert(result->type == glsl_type::uint_type);
+      return result;
+   }
+
+   /**
+    * \brief Lower an unpackUnorm2x16 expression.
+    *
+    * \param uint_rval is unpackUnorm2x16's input
+    * \return unpackUnorm2x16's output as a vec2 rvalue
+    */
+   ir_rvalue*
+   lower_unpack_unorm_2x16(ir_rvalue *uint_rval)
+   {
+      /* From page 89 (95 of pdf) of the GLSL ES 3.00 spec:
+       *
+       *    highp vec2 unpackUnorm2x16 (highp uint p)
+       *    -----------------------------------------
+       *    First, unpacks a single 32-bit unsigned integer p into a pair of
+       *    16-bit unsigned integers. Then, each component is converted to
+       *    a normalized floating-point value to generate the returned
+       *    two-component vector.
+       *
+       *    The conversion for unpacked fixed-point value f to floating point is
+       *    done as follows:
+       *
+       *       unpackUnorm2x16: f / 65535.0
+       *
+       *    The first component of the returned vector will be extracted from the
+       *    least significant bits of the input; the last component will be
+       *    extracted from the most significant bits.
+       *
+       * This function generates IR that approximates the following pseudo-GLSL:
+       *
+       *     return vec2(unpack_uint_to_uvec2(UINT_RVALUE)) / 65535.0f;
+       */
+
+      assert(uint_rval->type == glsl_type::uint_type);
+
+      ir_rvalue *result = div(u2f(unpack_uint_to_uvec2(uint_rval)),
+                              constant(65535.0f));
+
+      assert(result->type == glsl_type::vec2_type);
+      return result;
+   }
+
+   /**
+    * \brief Lower an unpackUnorm4x8 expression.
+    *
+    * \param uint_rval is unpackUnorm4x8's input
+    * \return unpackUnorm4x8's output as a vec4 rvalue
+    */
+   ir_rvalue*
+   lower_unpack_unorm_4x8(ir_rvalue *uint_rval)
+   {
+      /* From page 137 (143 of pdf) of the GLSL 4.30 spec:
+       *
+       *    highp vec4 unpackUnorm4x8 (highp uint p)
+       *    ----------------------------------------
+       *    First, unpacks a single 32-bit unsigned integer p into four
+       *    8-bit unsigned integers. Then, each component is converted to
+       *    a normalized floating-point value to generate the returned
+       *    four-component vector.
+       *
+       *    The conversion for unpacked fixed-point value f to floating point is
+       *    done as follows:
+       *
+       *       unpackUnorm4x8: f / 255.0
+       *
+       *    The first component of the returned vector will be extracted from the
+       *    least significant bits of the input; the last component will be
+       *    extracted from the most significant bits.
+       *
+       * This function generates IR that approximates the following pseudo-GLSL:
+       *
+       *     return vec4(unpack_uint_to_uvec4(UINT_RVALUE)) / 255.0f;
+       */
+
+      assert(uint_rval->type == glsl_type::uint_type);
+
+      ir_rvalue *result = div(u2f(unpack_uint_to_uvec4(uint_rval)),
+                              constant(255.0f));
+
+      assert(result->type == glsl_type::vec4_type);
+      return result;
+   }
+
+   /**
+    * \brief Lower the component-wise calculation of packHalf2x16.
+    *
+    * \param f_rval is one component of packHalf2x16's input
+    * \param e_rval is the unshifted exponent bits of f_rval
+    * \param m_rval is the unshifted mantissa bits of f_rval
+    *
+    * \return a uint rvalue that encodes a float16 in its lower 16 bits
+    */
+   ir_rvalue*
+   pack_half_1x16_nosign(ir_rvalue *f_rval,
+                         ir_rvalue *e_rval,
+                         ir_rvalue *m_rval)
+   {
+      assert(e_rval->type == glsl_type::uint_type);
+      assert(m_rval->type == glsl_type::uint_type);
+
+      /* uint u16; */
+      ir_variable *u16 = factory.make_temp(glsl_type::uint_type,
+                                           "tmp_pack_half_1x16_u16");
+
+      /* float f = F_RVAL; */
+      ir_variable *f = factory.make_temp(glsl_type::float_type,
+                                          "tmp_pack_half_1x16_f");
+      factory.emit(assign(f, f_rval));
+
+      /* uint e = E_RVAL; */
+      ir_variable *e = factory.make_temp(glsl_type::uint_type,
+                                          "tmp_pack_half_1x16_e");
+      factory.emit(assign(e, e_rval));
+
+      /* uint m = M_RVAL; */
+      ir_variable *m = factory.make_temp(glsl_type::uint_type,
+                                          "tmp_pack_half_1x16_m");
+      factory.emit(assign(m, m_rval));
+
+      /* Preliminaries
+       * -------------
+       *
+       * For a float16, the bit layout is:
+       *
+       *   sign:     15
+       *   exponent: 10:14
+       *   mantissa: 0:9
+       *
+       * Let f16 be a float16 value. The sign, exponent, and mantissa
+       * determine its value thus:
+       *
+       *   if e16 = 0 and m16 = 0, then zero:       (-1)^s16 * 0                               (1)
+       *   if e16 = 0 and m16 != 0, then subnormal: (-1)^s16 * 2^(e16 - 14) * (m16 / 2^10)     (2)
+       *   if 0 < e16 < 31, then normal:            (-1)^s16 * 2^(e16 - 15) * (1 + m16 / 2^10) (3)
+       *   if e16 = 31 and m16 = 0, then infinite:  (-1)^s16 * inf                             (4)
+       *   if e16 = 31 and m16 != 0, then           NaN                                        (5)
+       *
+       * where 0 <= m16 < 2^10.
+       *
+       * For a float32, the bit layout is:
+       *
+       *   sign:     31
+       *   exponent: 23:30
+       *   mantissa: 0:22
+       *
+       * Let f32 be a float32 value. The sign, exponent, and mantissa
+       * determine its value thus:
+       *
+       *   if e32 = 0 and m32 = 0, then zero:        (-1)^s * 0                                (10)
+       *   if e32 = 0 and m32 != 0, then subnormal:  (-1)^s * 2^(e32 - 126) * (m32 / 2^23)     (11)
+       *   if 0 < e32 < 255, then normal:            (-1)^s * 2^(e32 - 127) * (1 + m32 / 2^23) (12)
+       *   if e32 = 255 and m32 = 0, then infinite:  (-1)^s * inf                              (13)
+       *   if e32 = 255 and m32 != 0, then           NaN                                       (14)
+       *
+       * where 0 <= m32 < 2^23.
+       *
+       * The minimum and maximum normal float16 values are
+       *
+       *   min_norm16 = 2^(1 - 15) * (1 + 0 / 2^10) = 2^(-14)   (20)
+       *   max_norm16 = 2^(30 - 15) * (1 + 1023 / 2^10)         (21)
+       *
+       * The step at max_norm16 is
+       *
+       *   max_step16 = 2^5                                     (22)
+       *
+       * Observe that the float16 boundary values in equations 20-21 lie in the
+       * range of normal float32 values.
+       *
+       *
+       * Rounding Behavior
+       * -----------------
+       * Not all float32 values can be exactly represented as a float16. We
+       * round all such intermediate float32 values to the nearest float16; if
+       * the float32 is exactly between two float16 values, we round to the one
+       * with an even mantissa. This rounding behavior has several benefits:
+       *
+       *   - It has no sign bias.
+       *
+       *   - It reproduces the behavior of real hardware: opcode F32TO16 in Intel's
+       *     GPU ISA.
+       *
+       *   - By reproducing the behavior of the GPU (at least on Intel hardware),
+       *     compile-time evaluation of constant packHalf2x16 GLSL expressions will
+       *     result in the same value as if the expression were executed on the
+       *     GPU.
+       *
+       * Calculation
+       * -----------
+       * Our task is to compute s16, e16, m16 given f32.  Since this function
+       * ignores the sign bit, assume that s32 = s16 = 0.  There are several
+       * cases to consider.
+       */
+
+      factory.emit(
+
+         /* Case 1) f32 is NaN
+          *
+          *   The resultant f16 will also be NaN.
+          */
+
+         /* if (e32 == 255 && m32 != 0) { */
+         if_tree(logic_and(equal(e, constant(0xffu << 23u)),
+                           logic_not(equal(m, constant(0u)))),
+
+            assign(u16, constant(0x7fffu)),
+
+         /* Case 2) f32 lies in the range [0, min_norm16).
+          *
+          *   The resultant float16 will be either zero, subnormal, or normal.
+          *
+          *   Solving
+          *
+          *     f32 = min_norm16       (30)
+          *
+          *   gives
+          *
+          *     e32 = 113 and m32 = 0  (31)
+          *
+          *   Therefore this case occurs if and only if
+          *
+          *     e32 < 113              (32)
+          */
+
+         /* } else if (e32 < 113) { */
+         if_tree(less(e, constant(113u << 23u)),
+
+            /* u16 = uint(round_to_even(abs(f32) * float(1u << 24u))); */
+            assign(u16, f2u(round_even(mul(expr(ir_unop_abs, f),
+                                           constant((float) (1 << 24)))))),
+
+         /* Case 3) f32 lies in the range
+          *         [min_norm16, max_norm16 + max_step16).
+          *
+          *   The resultant float16 will be either normal or infinite.
+          *
+          *   Solving
+          *
+          *     f32 = max_norm16 + max_step16           (40)
+          *         = 2^15 * (1 + 1023 / 2^10) + 2^5    (41)
+          *         = 2^16                              (42)
+          *   gives
+          *
+          *     e32 = 143 and m32 = 0                   (43)
+          *
+          *   We already solved the boundary condition f32 = min_norm16 above
+          *   in equation 31. Therefore this case occurs if and only if
+          *
+          *     113 <= e32 and e32 < 143
+          */
+
+         /* } else if (e32 < 143) { */
+         if_tree(less(e, constant(143u << 23u)),
+
+            /* The addition below handles the case where the mantissa rounds
+             * up to 1024 and bumps the exponent.
+             *
+             * u16 = ((e - (112u << 23u)) >> 13u)
+             *     + round_to_even(float(m) / float(1u << 13u));
+             */
+            assign(u16, add(rshift(sub(e, constant(112u << 23u)),
+                                   constant(13u)),
+                            f2u(round_even(
+                                  div(u2f(m), constant((float) (1 << 13))))))),
+
+         /* Case 4) f32 lies in the range [max_norm16 + max_step16, inf].
+          *
+          *   The resultant float16 will be infinite.
+          *
+          *   The cases above caught all float32 values in the range
+          *   [0, max_norm16 + max_step16), so this is the fall-through case.
+          */
+
+         /* } else { */
+
+            assign(u16, constant(31u << 10u))))));
+
+         /* } */
+
+       return deref(u16).val;
+   }
+
+   /**
+    * \brief Lower a packHalf2x16 expression.
+    *
+    * \param vec2_rval is packHalf2x16's input
+    * \return packHalf2x16's output as a uint rvalue
+    */
+   ir_rvalue*
+   lower_pack_half_2x16(ir_rvalue *vec2_rval)
+   {
+      /* From page 89 (95 of pdf) of the GLSL ES 3.00 spec:
+       *
+       *    highp uint packHalf2x16 (mediump vec2 v)
+       *    ----------------------------------------
+       *    Returns an unsigned integer obtained by converting the components of
+       *    a two-component floating-point vector to the 16-bit floating-point
+       *    representation found in the OpenGL ES Specification, and then packing
+       *    these two 16-bit integers into a 32-bit unsigned integer.
+       *
+       *    The first vector component specifies the 16 least-significant bits
+       *    of the result; the second component specifies the 16 most-significant
+       *    bits.
+       */
+
+      assert(vec2_rval->type == glsl_type::vec2_type);
+
+      /* vec2 f = VEC2_RVAL; */
+      ir_variable *f = factory.make_temp(glsl_type::vec2_type,
+                                         "tmp_pack_half_2x16_f");
+      factory.emit(assign(f, vec2_rval));
+
+      /* uvec2 f32 = bitcast_f2u(f); */
+      ir_variable *f32 = factory.make_temp(glsl_type::uvec2_type,
+                                            "tmp_pack_half_2x16_f32");
+      factory.emit(assign(f32, expr(ir_unop_bitcast_f2u, f)));
+
+      /* uvec2 f16; */
+      ir_variable *f16 = factory.make_temp(glsl_type::uvec2_type,
+                                        "tmp_pack_half_2x16_f16");
+
+      /* Get f32's unshifted exponent bits.
+       *
+       *   uvec2 e = f32 & 0x7f800000u;
+       */
+      ir_variable *e = factory.make_temp(glsl_type::uvec2_type,
+                                          "tmp_pack_half_2x16_e");
+      factory.emit(assign(e, bit_and(f32, constant(0x7f800000u))));
+
+      /* Get f32's unshifted mantissa bits.
+       *
+       *   uvec2 m = f32 & 0x007fffffu;
+       */
+      ir_variable *m = factory.make_temp(glsl_type::uvec2_type,
+                                          "tmp_pack_half_2x16_m");
+      factory.emit(assign(m, bit_and(f32, constant(0x007fffffu))));
+
+      /* Set f16's exponent and mantissa bits.
+       *
+       *   f16.x = pack_half_1x16_nosign(f.x, e.x, m.x);
+       *   f16.y = pack_half_1x16_nosign(f.y, e.y, m.y);
+       */
+      factory.emit(assign(f16, pack_half_1x16_nosign(swizzle_x(f),
+                                                     swizzle_x(e),
+                                                     swizzle_x(m)),
+                           WRITEMASK_X));
+      factory.emit(assign(f16, pack_half_1x16_nosign(swizzle_y(f),
+                                                     swizzle_y(e),
+                                                     swizzle_y(m)),
+                           WRITEMASK_Y));
+
+      /* Set f16's sign bits.
+       *
+       *   f16 |= (f32 & (1u << 31u)) >> 16u;
+       */
+      factory.emit(
+         assign(f16, bit_or(f16,
+                            rshift(bit_and(f32, constant(1u << 31u)),
+                                   constant(16u)))));
+
+
+      /* return (f16.y << 16u) | f16.x; */
+      ir_rvalue *result = bit_or(lshift(swizzle_y(f16),
+                                        constant(16u)),
+                                 swizzle_x(f16));
+
+      assert(result->type == glsl_type::uint_type);
+      return result;
+   }
+
+   /**
+    * \brief Split packHalf2x16's vec2 operand into two floats.
+    *
+    * \param vec2_rval is packHalf2x16's input
+    * \return a uint rvalue
+    *
+    * Some code generators, such as the i965 fragment shader, require that all
+    * vector expressions be lowered to a sequence of scalar expressions.
+    * However, packHalf2x16 cannot be scalarized by the same mechanism as
+    * a true vector operation because its input and output have a differing
+    * number of vector components.
+    *
+    * This method scalarizes packHalf2x16 by transforming it from an unary
+    * operation having vector input to a binary operation having scalar input.
+    * That is, it transforms
+    *
+    *    packHalf2x16(VEC2_RVAL);
+    *
+    * into
+    *
+    *    vec2 v = VEC2_RVAL;
+    *    return packHalf2x16_split(v.x, v.y);
+    */
+   ir_rvalue*
+   split_pack_half_2x16(ir_rvalue *vec2_rval)
+   {
+      assert(vec2_rval->type == glsl_type::vec2_type);
+      /* vec2 v = VEC2_RVAL; */
+      ir_variable *v = factory.make_temp(glsl_type::vec2_type,
+                                         "tmp_split_pack_half_2x16_v");
+      factory.emit(assign(v, vec2_rval));
+      /* return packHalf2x16_split(v.x, v.y); */
+      return expr(ir_binop_pack_half_2x16_split, swizzle_x(v), swizzle_y(v));
+   }
+
+   /**
+    * \brief Lower the component-wise calculation of unpackHalf2x16.
+    *
+    * Given a uint that encodes a float16 in its lower 16 bits, this function
+    * returns a uint that encodes a float32 with the same value. The sign bit
+    * of the float16 is ignored.
+    *
+    * \param e_rval is the unshifted exponent bits of a float16
+    * \param m_rval is the unshifted mantissa bits of a float16
+    * \return a uint rvalue that encodes a float32
+    */
+   ir_rvalue*
+   unpack_half_1x16_nosign(ir_rvalue *e_rval, ir_rvalue *m_rval)
+   {
+      assert(e_rval->type == glsl_type::uint_type);
+      assert(m_rval->type == glsl_type::uint_type);
+
+      /* uint u32; */
+      ir_variable *u32 = factory.make_temp(glsl_type::uint_type,
+                                           "tmp_unpack_half_1x16_u32");
+
+      /* uint e = E_RVAL; */
+      ir_variable *e = factory.make_temp(glsl_type::uint_type,
+                                          "tmp_unpack_half_1x16_e");
+      factory.emit(assign(e, e_rval));
+
+      /* uint m = M_RVAL; */
+      ir_variable *m = factory.make_temp(glsl_type::uint_type,
+                                          "tmp_unpack_half_1x16_m");
+      factory.emit(assign(m, m_rval));
+
+      /* Preliminaries
+       * -------------
+       *
+       * For a float16, the bit layout is:
+       *
+       *   sign:     15
+       *   exponent: 10:14
+       *   mantissa: 0:9
+       *
+       * Let f16 be a float16 value. The sign, exponent, and mantissa
+       * determine its value thus:
+       *
+       *   if e16 = 0 and m16 = 0, then zero:       (-1)^s16 * 0                               (1)
+       *   if e16 = 0 and m16 != 0, then subnormal: (-1)^s16 * 2^(e16 - 14) * (m16 / 2^10)     (2)
+       *   if 0 < e16 < 31, then normal:            (-1)^s16 * 2^(e16 - 15) * (1 + m16 / 2^10) (3)
+       *   if e16 = 31 and m16 = 0, then infinite:  (-1)^s16 * inf                             (4)
+       *   if e16 = 31 and m16 != 0, then           NaN                                        (5)
+       *
+       * where 0 <= m16 < 2^10.
+       *
+       * For a float32, the bit layout is:
+       *
+       *   sign:     31
+       *   exponent: 23:30
+       *   mantissa: 0:22
+       *
+       * Let f32 be a float32 value. The sign, exponent, and mantissa
+       * determine its value thus:
+       *
+       *   if e32 = 0 and m32 = 0, then zero:        (-1)^s * 0                                (10)
+       *   if e32 = 0 and m32 != 0, then subnormal:  (-1)^s * 2^(e32 - 126) * (m32 / 2^23)     (11)
+       *   if 0 < e32 < 255, then normal:            (-1)^s * 2^(e32 - 127) * (1 + m32 / 2^23) (12)
+       *   if e32 = 255 and m32 = 0, then infinite:  (-1)^s * inf                              (13)
+       *   if e32 = 255 and m32 != 0, then           NaN                                       (14)
+       *
+       * where 0 <= m32 < 2^23.
+       *
+       * Calculation
+       * -----------
+       * Our task is to compute s32, e32, m32 given f16.  Since this function
+       * ignores the sign bit, assume that s32 = s16 = 0.  There are several
+       * cases to consider.
+       */
+
+      factory.emit(
+
+         /* Case 1) f16 is zero or subnormal.
+          *
+          *   The simplest method of calculating f32 in this case is
+          *
+          *     f32 = f16                       (20)
+          *         = 2^(-14) * (m16 / 2^10)    (21)
+          *         = m16 / 2^24                (22)
+          */
+
+         /* if (e16 == 0) { */
+         if_tree(equal(e, constant(0u)),
+
+            /* u32 = bitcast_f2u(float(m) / float(1 << 24)); */
+            assign(u32, expr(ir_unop_bitcast_f2u,
+                                div(u2f(m), constant((float)(1 << 24))))),
+
+         /* Case 2) f16 is normal.
+          *
+          *   The equation
+          *
+          *     f32 = f16                              (30)
+          *     2^(e32 - 127) * (1 + m32 / 2^23) =     (31)
+          *       2^(e16 - 15) * (1 + m16 / 2^10)
+          *
+          *   can be decomposed into two
+          *
+          *     2^(e32 - 127) = 2^(e16 - 15)           (32)
+          *     1 + m32 / 2^23 = 1 + m16 / 2^10        (33)
+          *
+          *   which solve to
+          *
+          *     e32 = e16 + 112                        (34)
+          *     m32 = m16 * 2^13                       (35)
+          */
+
+         /* } else if (e16 < 31) { */
+         if_tree(less(e, constant(31u << 10u)),
+
+              /* u32 = ((e + (112 << 10)) | m) << 13;
+               */
+              assign(u32, lshift(bit_or(add(e, constant(112u << 10u)), m),
+                                 constant(13u))),
+
+         /* Case 3) f16 is infinite. */
+         /* } else if (m16 == 0) { */
+         if_tree(equal(m, constant(0u)),
+
+                 assign(u32, constant(255u << 23u)),
+
+         /* Case 4) f16 is NaN. */
+         /* } else { */
+
+            assign(u32, constant(0x7fffffffu))))));
+
+         /* } */
+
+      return deref(u32).val;
+   }
+
+   /**
+    * \brief Lower an unpackHalf2x16 expression.
+    *
+    * \param uint_rval is unpackHalf2x16's input
+    * \return unpackHalf2x16's output as a vec2 rvalue
+    */
+   ir_rvalue*
+   lower_unpack_half_2x16(ir_rvalue *uint_rval)
+   {
+      /* From page 89 (95 of pdf) of the GLSL ES 3.00 spec:
+       *
+       *    mediump vec2 unpackHalf2x16 (highp uint v)
+       *    ------------------------------------------
+       *    Returns a two-component floating-point vector with components
+       *    obtained by unpacking a 32-bit unsigned integer into a pair of 16-bit
+       *    values, interpreting those values as 16-bit floating-point numbers
+       *    according to the OpenGL ES Specification, and converting them to
+       *    32-bit floating-point values.
+       *
+       *    The first component of the vector is obtained from the
+       *    16 least-significant bits of v; the second component is obtained
+       *    from the 16 most-significant bits of v.
+       */
+      assert(uint_rval->type == glsl_type::uint_type);
+
+      /* uint u = UINT_RVAL;
+       * uvec2 f16 = uvec2(u & 0xffffu, u >> 16u);
+       */
+      ir_variable *f16 = factory.make_temp(glsl_type::uvec2_type,
+                                            "tmp_unpack_half_2x16_f16");
+      factory.emit(assign(f16, unpack_uint_to_uvec2(uint_rval)));
+
+      /* uvec2 f32; */
+      ir_variable *f32 = factory.make_temp(glsl_type::uvec2_type,
+                                            "tmp_unpack_half_2x16_f32");
+
+      /* Get f16's unshifted exponent bits.
+       *
+       *    uvec2 e = f16 & 0x7c00u;
+       */
+      ir_variable *e = factory.make_temp(glsl_type::uvec2_type,
+                                          "tmp_unpack_half_2x16_e");
+      factory.emit(assign(e, bit_and(f16, constant(0x7c00u))));
+
+      /* Get f16's unshifted mantissa bits.
+       *
+       *    uvec2 m = f16 & 0x03ffu;
+       */
+      ir_variable *m = factory.make_temp(glsl_type::uvec2_type,
+                                          "tmp_unpack_half_2x16_m");
+      factory.emit(assign(m, bit_and(f16, constant(0x03ffu))));
+
+      /* Set f32's exponent and mantissa bits.
+       *
+       *   f32.x = unpack_half_1x16_nosign(e.x, m.x);
+       *   f32.y = unpack_half_1x16_nosign(e.y, m.y);
+       */
+      factory.emit(assign(f32, unpack_half_1x16_nosign(swizzle_x(e),
+                                                       swizzle_x(m)),
+                           WRITEMASK_X));
+      factory.emit(assign(f32, unpack_half_1x16_nosign(swizzle_y(e),
+                                                       swizzle_y(m)),
+                           WRITEMASK_Y));
+
+      /* Set f32's sign bit.
+       *
+       *    f32 |= (f16 & 0x8000u) << 16u;
+       */
+      factory.emit(assign(f32, bit_or(f32,
+                                       lshift(bit_and(f16,
+                                                      constant(0x8000u)),
+                                              constant(16u)))));
+
+      /* return bitcast_u2f(f32); */
+      ir_rvalue *result = expr(ir_unop_bitcast_u2f, f32);
+      assert(result->type == glsl_type::vec2_type);
+      return result;
+   }
+
+   /**
+    * \brief Split unpackHalf2x16 into two operations.
+    *
+    * \param uint_rval is unpackHalf2x16's input
+    * \return a vec2 rvalue
+    *
+    * Some code generators, such as the i965 fragment shader, require that all
+    * vector expressions be lowered to a sequence of scalar expressions.
+    * However, unpackHalf2x16 cannot be scalarized by the same method as
+    * a true vector operation because the number of components of its input
+    * and output differ.
+    *
+    * This method scalarizes unpackHalf2x16 by transforming it from a single
+    * operation having vec2 output to a pair of operations each having float
+    * output. That is, it transforms
+    *
+    *   unpackHalf2x16(UINT_RVAL)
+    *
+    * into
+    *
+    *   uint u = UINT_RVAL;
+    *   vec2 v;
+    *
+    *   v.x = unpackHalf2x16_split_x(u);
+    *   v.y = unpackHalf2x16_split_y(u);
+    *
+    *   return v;
+    */
+   ir_rvalue*
+   split_unpack_half_2x16(ir_rvalue *uint_rval)
+   {
+      assert(uint_rval->type == glsl_type::uint_type);
+
+      /* uint u = UINT_RVAL; */
+      ir_variable *u = factory.make_temp(glsl_type::uint_type,
+                                          "tmp_split_unpack_half_2x16_u");
+      factory.emit(assign(u, uint_rval));
+
+      /* vec2 v; */
+      ir_variable *v = factory.make_temp(glsl_type::vec2_type,
+                                          "tmp_split_unpack_half_2x16_v");
+
+      /* v.x = unpack_half_2x16_split_x(u); */
+      factory.emit(assign(v, expr(ir_unop_unpack_half_2x16_split_x, u),
+                           WRITEMASK_X));
+
+      /* v.y = unpack_half_2x16_split_y(u); */
+      factory.emit(assign(v, expr(ir_unop_unpack_half_2x16_split_y, u),
+                           WRITEMASK_Y));
+      /* return v; */
+      return deref(v).val;
+   }
+};
+
+} // namespace anonymous
+
+/**
+ * \brief Lower the builtin packing functions.
+ *
+ * \param op_mask is a bitmask of `enum lower_packing_builtins_op`.
+ */
+bool
+lower_packing_builtins(exec_list *instructions, int op_mask)
+{
+   lower_packing_builtins_visitor v(op_mask);
+   visit_list_elements(&v, instructions, true); /* run v over the list */
+   return v.get_progress(); /* result is the visitor's progress flag */
+}
diff --git a/mesalib/src/glsl/lower_ubo_reference.cpp b/mesalib/src/glsl/lower_ubo_reference.cpp
index e8d2c4742..026197df7 100644
--- a/mesalib/src/glsl/lower_ubo_reference.cpp
+++ b/mesalib/src/glsl/lower_ubo_reference.cpp
@@ -61,10 +61,58 @@ public:
    bool progress;
 };
 
-static inline unsigned int
-align(unsigned int a, unsigned int align)
+/**
+ * Determine the name of the interface block field
+ *
+ * This is the name of the specific member as it would appear in the
+ * \c gl_uniform_buffer_variable::Name field in the shader's
+ * \c UniformBlocks array.
+ */
+static const char *
+interface_field_name(void *mem_ctx, char *base_name, ir_dereference *d)
 {
-   return (a + align - 1) / align * align;
+   ir_constant *previous_index = NULL;
+   /* Walk from the given dereference inward until a variable is reached. */
+   while (d != NULL) {
+      switch (d->ir_type) {
+      case ir_type_dereference_variable: {
+         ir_dereference_variable *v = (ir_dereference_variable *) d;
+         if (previous_index
+             && v->var->is_interface_instance()
+             && v->var->type->is_array())
+            return ralloc_asprintf(mem_ctx,
+                                   "%s[%u]",
+                                   base_name,
+                                   previous_index->get_uint_component(0));
+         else
+            return base_name;
+         /* Not reached; both branches above return. */
+         break;
+      }
+
+      case ir_type_dereference_record: {
+         ir_dereference_record *r = (ir_dereference_record *) d;
+         /* Continue with the record expression being accessed. */
+         d = r->record->as_dereference();
+         break;
+      }
+
+      case ir_type_dereference_array: {
+         ir_dereference_array *a = (ir_dereference_array *) d;
+         /* Continue with the array and remember its index as a constant. */
+         d = a->array->as_dereference();
+         previous_index = a->array_index->as_constant();
+         break;
+      }
+
+      default:
+         assert(!"Should not get here.");
+         return NULL; /* d is unchanged here; looping again would never end */
+      }
+   }
+
+   assert(!"Should not get here.");
+   return NULL;
 }
 
 void
@@ -78,13 +126,30 @@ lower_ubo_reference_visitor::handle_rvalue(ir_rvalue **rvalue)
       return;
 
    ir_variable *var = deref->variable_referenced();
-   if (!var || var->uniform_block == -1)
+   if (!var || !var->is_in_uniform_block())
       return;
 
    mem_ctx = ralloc_parent(*rvalue);
-   uniform_block = var->uniform_block;
-   struct gl_uniform_block *block = &shader->UniformBlocks[uniform_block];
-   this->ubo_var = &block->Uniforms[var->location];
+
+   const char *const field_name =
+      interface_field_name(mem_ctx, (char *) var->interface_type->name, deref);
+
+   this->uniform_block = -1;
+   for (unsigned i = 0; i < shader->NumUniformBlocks; i++) {
+      if (strcmp(field_name, shader->UniformBlocks[i].Name) == 0) {
+         this->uniform_block = i;
+
+         struct gl_uniform_block *block = &shader->UniformBlocks[i];
+
+         this->ubo_var = var->is_interface_instance()
+            ? &block->Uniforms[0] : &block->Uniforms[var->location];
+
+         break;
+      }
+   }
+
+   assert(this->uniform_block != (unsigned) -1);
+
    ir_rvalue *offset = new(mem_ctx) ir_constant(0u);
    unsigned const_offset = 0;
    bool row_major = ubo_var->RowMajor;
@@ -111,9 +176,21 @@ lower_ubo_reference_visitor::handle_rvalue(ir_rvalue **rvalue)
 	     * vector) is handled below in emit_ubo_loads.
 	     */
 	    array_stride = 4;
+         } else if (deref_array->type->is_interface()) {
+            /* We're processing an array dereference of an interface instance
+	     * array.  The thing being dereferenced *must* be a variable
+	     * dereference because interfaces cannot be embedded in other
+	     * types.  In terms of calculating the offsets for the lowering
+	     * pass, we don't care about the array index.  All elements of an
+	     * interface instance array will have the same offsets relative to
+	     * the base of the block that backs them.
+             */
+            assert(deref_array->array->as_dereference_variable());
+            deref = deref_array->array->as_dereference();
+            break;
 	 } else {
 	    array_stride = deref_array->type->std140_size(row_major);
-	    array_stride = align(array_stride, 16);
+	    array_stride = glsl_align(array_stride, 16);
 	 }
 
 	 ir_constant *const_index = deref_array->array_index->as_constant();
@@ -138,7 +215,7 @@ lower_ubo_reference_visitor::handle_rvalue(ir_rvalue **rvalue)
 	    const glsl_type *type = struct_type->fields.structure[i].type;
 	    unsigned field_align = type->std140_base_alignment(row_major);
 	    max_field_align = MAX2(field_align, max_field_align);
-	    intra_struct_offset = align(intra_struct_offset, field_align);
+	    intra_struct_offset = glsl_align(intra_struct_offset, field_align);
 
 	    if (strcmp(struct_type->fields.structure[i].name,
 		       deref_record->field) == 0)
@@ -146,7 +223,7 @@ lower_ubo_reference_visitor::handle_rvalue(ir_rvalue **rvalue)
 	    intra_struct_offset += type->std140_size(row_major);
 	 }
 
-	 const_offset = align(const_offset, max_field_align);
+	 const_offset = glsl_align(const_offset, max_field_align);
 	 const_offset += intra_struct_offset;
 
 	 deref = deref_record->record->as_dereference();
@@ -217,8 +294,8 @@ lower_ubo_reference_visitor::emit_ubo_loads(ir_dereference *deref,
 					       field->name);
 
 	 field_offset =
-	    align(field_offset,
-		  field->type->std140_base_alignment(ubo_var->RowMajor));
+	    glsl_align(field_offset,
+		       field->type->std140_base_alignment(ubo_var->RowMajor));
 
 	 emit_ubo_loads(field_deref, base_offset, deref_offset + field_offset);
 
@@ -229,7 +306,8 @@ lower_ubo_reference_visitor::emit_ubo_loads(ir_dereference *deref,
 
    if (deref->type->is_array()) {
       unsigned array_stride =
-	 align(deref->type->fields.array->std140_size(ubo_var->RowMajor), 16);
+	 glsl_align(deref->type->fields.array->std140_size(ubo_var->RowMajor),
+		    16);
 
       for (unsigned i = 0; i < deref->type->length; i++) {
 	 ir_constant *element = new(mem_ctx) ir_constant(i);
diff --git a/mesalib/src/glsl/lower_variable_index_to_cond_assign.cpp b/mesalib/src/glsl/lower_variable_index_to_cond_assign.cpp
index 57771074a..040b0bf83 100644
--- a/mesalib/src/glsl/lower_variable_index_to_cond_assign.cpp
+++ b/mesalib/src/glsl/lower_variable_index_to_cond_assign.cpp
@@ -364,12 +364,16 @@ public:
 	 return this->lower_temps;
       case ir_var_uniform:
 	 return this->lower_uniforms;
-      case ir_var_in:
+      case ir_var_function_in:
       case ir_var_const_in:
-	 return (var->location == -1) ? this->lower_temps : this->lower_inputs;
-      case ir_var_out:
-	 return (var->location == -1) ? this->lower_temps : this->lower_outputs;
-      case ir_var_inout:
+         return this->lower_temps;
+      case ir_var_shader_in:
+         return this->lower_inputs;
+      case ir_var_function_out:
+         return this->lower_temps;
+      case ir_var_shader_out:
+         return this->lower_outputs;
+      case ir_var_function_inout:
 	 return this->lower_temps;
       }
 
diff --git a/mesalib/src/glsl/opt_constant_folding.cpp b/mesalib/src/glsl/opt_constant_folding.cpp
index 7d94d481c..072fefe9a 100644
--- a/mesalib/src/glsl/opt_constant_folding.cpp
+++ b/mesalib/src/glsl/opt_constant_folding.cpp
@@ -127,7 +127,8 @@ ir_constant_folding_visitor::visit_enter(ir_call *ir)
       ir_rvalue *param_rval = (ir_rvalue *)iter.get();
       ir_variable *sig_param = (ir_variable *)sig_iter.get();
 
-      if (sig_param->mode == ir_var_in || sig_param->mode == ir_var_const_in) {
+      if (sig_param->mode == ir_var_function_in
+          || sig_param->mode == ir_var_const_in) {
 	 ir_rvalue *new_param = param_rval;
 
 	 handle_rvalue(&new_param);
diff --git a/mesalib/src/glsl/opt_constant_propagation.cpp b/mesalib/src/glsl/opt_constant_propagation.cpp
index a03811999..2f65937fe 100644
--- a/mesalib/src/glsl/opt_constant_propagation.cpp
+++ b/mesalib/src/glsl/opt_constant_propagation.cpp
@@ -285,7 +285,8 @@ ir_constant_propagation_visitor::visit_enter(ir_call *ir)
    foreach_iter(exec_list_iterator, iter, ir->actual_parameters) {
       ir_variable *sig_param = (ir_variable *)sig_param_iter.get();
       ir_rvalue *param = (ir_rvalue *)iter.get();
-      if (sig_param->mode != ir_var_out && sig_param->mode != ir_var_inout) {
+      if (sig_param->mode != ir_var_function_out
+          && sig_param->mode != ir_var_function_inout) {
 	 ir_rvalue *new_param = param;
 	 handle_rvalue(&new_param);
          if (new_param != param)
diff --git a/mesalib/src/glsl/opt_constant_variable.cpp b/mesalib/src/glsl/opt_constant_variable.cpp
index 1bbaf8e47..cbe6450c6 100644
--- a/mesalib/src/glsl/opt_constant_variable.cpp
+++ b/mesalib/src/glsl/opt_constant_variable.cpp
@@ -137,8 +137,8 @@ ir_constant_variable_visitor::visit_enter(ir_call *ir)
       ir_rvalue *param_rval = (ir_rvalue *)iter.get();
       ir_variable *param = (ir_variable *)sig_iter.get();
 
-      if (param->mode == ir_var_out ||
-	  param->mode == ir_var_inout) {
+      if (param->mode == ir_var_function_out ||
+	  param->mode == ir_var_function_inout) {
 	 ir_variable *var = param_rval->variable_referenced();
 	 struct assignment_entry *entry;
 
diff --git a/mesalib/src/glsl/opt_copy_propagation.cpp b/mesalib/src/glsl/opt_copy_propagation.cpp
index 2952ce594..7282b611e 100644
--- a/mesalib/src/glsl/opt_copy_propagation.cpp
+++ b/mesalib/src/glsl/opt_copy_propagation.cpp
@@ -189,7 +189,8 @@ ir_copy_propagation_visitor::visit_enter(ir_call *ir)
    foreach_iter(exec_list_iterator, iter, ir->actual_parameters) {
       ir_variable *sig_param = (ir_variable *)sig_param_iter.get();
       ir_instruction *ir = (ir_instruction *)iter.get();
-      if (sig_param->mode != ir_var_out && sig_param->mode != ir_var_inout) {
+      if (sig_param->mode != ir_var_function_out
+          && sig_param->mode != ir_var_function_inout) {
          ir->accept(this);
       }
       sig_param_iter.next();
diff --git a/mesalib/src/glsl/opt_copy_propagation_elements.cpp b/mesalib/src/glsl/opt_copy_propagation_elements.cpp
index de9f4ef6f..6a19da40d 100644
--- a/mesalib/src/glsl/opt_copy_propagation_elements.cpp
+++ b/mesalib/src/glsl/opt_copy_propagation_elements.cpp
@@ -297,7 +297,8 @@ ir_copy_propagation_elements_visitor::visit_enter(ir_call *ir)
    foreach_iter(exec_list_iterator, iter, ir->actual_parameters) {
       ir_variable *sig_param = (ir_variable *)sig_param_iter.get();
       ir_instruction *ir = (ir_instruction *)iter.get();
-      if (sig_param->mode != ir_var_out && sig_param->mode != ir_var_inout) {
+      if (sig_param->mode != ir_var_function_out
+          && sig_param->mode != ir_var_function_inout) {
          ir->accept(this);
       }
       sig_param_iter.next();
diff --git a/mesalib/src/glsl/opt_dead_code.cpp b/mesalib/src/glsl/opt_dead_code.cpp
index 47247e20d..b65e5c2ce 100644
--- a/mesalib/src/glsl/opt_dead_code.cpp
+++ b/mesalib/src/glsl/opt_dead_code.cpp
@@ -77,10 +77,11 @@ do_dead_code(exec_list *instructions, bool uniform_locations_assigned)
 
       if (entry->assign) {
 	 /* Remove a single dead assignment to the variable we found.
-	  * Don't do so if it's a shader output, though.
+	  * Don't do so if it's a shader or function output, though.
 	  */
-	 if (entry->var->mode != ir_var_out &&
-	     entry->var->mode != ir_var_inout) {
+	 if (entry->var->mode != ir_var_function_out &&
+	     entry->var->mode != ir_var_function_inout &&
+             entry->var->mode != ir_var_shader_out) {
 	    entry->assign->remove();
 	    progress = true;
 
@@ -97,15 +98,10 @@ do_dead_code(exec_list *instructions, bool uniform_locations_assigned)
 	 /* uniform initializers are precious, and could get used by another
 	  * stage.  Also, once uniform locations have been assigned, the
 	  * declaration cannot be deleted.
-	  *
-	  * Also, GL_ARB_uniform_buffer_object says that std140
-	  * uniforms will not be eliminated.  Since we always do
-	  * std140, just don't eliminate uniforms in UBOs.
 	  */
 	 if (entry->var->mode == ir_var_uniform &&
 	     (uniform_locations_assigned ||
-	      entry->var->constant_value ||
-	      entry->var->uniform_block != -1))
+	      entry->var->constant_value))
 	    continue;
 
 	 entry->var->remove();
diff --git a/mesalib/src/glsl/opt_function_inlining.cpp b/mesalib/src/glsl/opt_function_inlining.cpp
index f9f5bd442..0733d5180 100644
--- a/mesalib/src/glsl/opt_function_inlining.cpp
+++ b/mesalib/src/glsl/opt_function_inlining.cpp
@@ -144,9 +144,9 @@ ir_call::generate_inline(ir_instruction *next_ir)
       }
 
       /* Move the actual param into our param variable if it's an 'in' type. */
-      if (parameters[i] && (sig_param->mode == ir_var_in ||
+      if (parameters[i] && (sig_param->mode == ir_var_function_in ||
 			    sig_param->mode == ir_var_const_in ||
-			    sig_param->mode == ir_var_inout)) {
+			    sig_param->mode == ir_var_function_inout)) {
 	 ir_assignment *assign;
 
 	 assign = new(ctx) ir_assignment(new(ctx) ir_dereference_variable(parameters[i]),
@@ -202,8 +202,8 @@ ir_call::generate_inline(ir_instruction *next_ir)
       const ir_variable *const sig_param = (ir_variable *) sig_param_iter.get();
 
       /* Move our param variable into the actual param if it's an 'out' type. */
-      if (parameters[i] && (sig_param->mode == ir_var_out ||
-			    sig_param->mode == ir_var_inout)) {
+      if (parameters[i] && (sig_param->mode == ir_var_function_out ||
+			    sig_param->mode == ir_var_function_inout)) {
 	 ir_assignment *assign;
 
 	 assign = new(ctx) ir_assignment(param->clone(ctx, NULL)->as_rvalue(),
diff --git a/mesalib/src/glsl/opt_structure_splitting.cpp b/mesalib/src/glsl/opt_structure_splitting.cpp
index 9b3f048e4..806c079e5 100644
--- a/mesalib/src/glsl/opt_structure_splitting.cpp
+++ b/mesalib/src/glsl/opt_structure_splitting.cpp
@@ -104,7 +104,8 @@ ir_structure_reference_visitor::get_variable_entry(ir_variable *var)
 {
    assert(var);
 
-   if (!var->type->is_record() || var->mode == ir_var_uniform)
+   if (!var->type->is_record() || var->mode == ir_var_uniform
+       || var->mode == ir_var_shader_in || var->mode == ir_var_shader_out)
       return NULL;
 
    foreach_iter(exec_list_iterator, iter, this->variable_list) {
diff --git a/mesalib/src/glsl/opt_tree_grafting.cpp b/mesalib/src/glsl/opt_tree_grafting.cpp
index 25b18ea94..113abb7b0 100644
--- a/mesalib/src/glsl/opt_tree_grafting.cpp
+++ b/mesalib/src/glsl/opt_tree_grafting.cpp
@@ -211,7 +211,8 @@ ir_tree_grafting_visitor::visit_enter(ir_call *ir)
       ir_rvalue *ir = (ir_rvalue *)iter.get();
       ir_rvalue *new_ir = ir;
 
-      if (sig_param->mode != ir_var_in && sig_param->mode != ir_var_const_in) {
+      if (sig_param->mode != ir_var_function_in
+          && sig_param->mode != ir_var_const_in) {
 	 if (check_graft(ir, sig_param) == visit_stop)
 	    return visit_stop;
 	 continue;
@@ -350,8 +351,9 @@ tree_grafting_basic_block(ir_instruction *bb_first,
       if (!lhs_var)
 	 continue;
 
-      if (lhs_var->mode == ir_var_out ||
-	  lhs_var->mode == ir_var_inout)
+      if (lhs_var->mode == ir_var_function_out ||
+	  lhs_var->mode == ir_var_function_inout ||
+          lhs_var->mode == ir_var_shader_out)
 	 continue;
 
       ir_variable_refcount_entry *entry = info->refs->get_variable_entry(lhs_var);
diff --git a/mesalib/src/glsl/s_expression.cpp b/mesalib/src/glsl/s_expression.cpp
index 57de9d334..1bdf6bca6 100644
--- a/mesalib/src/glsl/s_expression.cpp
+++ b/mesalib/src/glsl/s_expression.cpp
@@ -66,18 +66,18 @@ read_atom(void *ctx, const char *&src, char *&symbol_buffer)
       return NULL; // no atom
 
    // Check for the special symbol '+INF', which means +Infinity.  Note: C99
-   // requires strtod to parse '+INF' as +Infinity, but we still support some
+   // requires strtof to parse '+INF' as +Infinity, but we still support some
    // non-C99-compliant compilers (e.g. MSVC).
    if (n == 4 && strncmp(src, "+INF", 4) == 0) {
       expr = new(ctx) s_float(std::numeric_limits<float>::infinity());
    } else {
       // Check if the atom is a number.
       char *float_end = NULL;
-      double f = glsl_strtod(src, &float_end);
+      float f = glsl_strtof(src, &float_end);
       if (float_end != src) {
          char *int_end = NULL;
          int i = strtol(src, &int_end, 10);
-         // If strtod matched more characters, it must have a decimal part
+         // If strtof matched more characters, it must have a decimal part
          if (float_end > int_end)
             expr = new(ctx) s_float(f);
          else
diff --git a/mesalib/src/glsl/standalone_scaffolding.cpp b/mesalib/src/glsl/standalone_scaffolding.cpp
index 33d3804c6..0fb4f5b16 100644
--- a/mesalib/src/glsl/standalone_scaffolding.cpp
+++ b/mesalib/src/glsl/standalone_scaffolding.cpp
@@ -34,6 +34,24 @@
 #include "ralloc.h"
 
 void
+_mesa_warning(struct gl_context *ctx, const char *fmt, ...)
+{
+    va_list args;
+
+    /* The context is unused by the standalone scaffolding. */
+    (void) ctx;
+
+    /* Emitted as three separate writes to stderr, so concurrent callers
+     * may interleave; acceptable for the standalone compiler.
+     */
+    va_start(args, fmt);
+    fputs("Mesa warning: ", stderr);
+    vfprintf(stderr, fmt, args);
+    fputc('\n', stderr);
+    va_end(args);
+}
+
+void
 _mesa_reference_shader(struct gl_context *ctx, struct gl_shader **ptr,
                        struct gl_shader *sh)
 {
@@ -81,6 +99,7 @@ void initialize_context_to_defaults(struct gl_context *ctx, gl_api api)
    ctx->Extensions.EXT_texture3D = true;
    ctx->Extensions.OES_EGL_image_external = true;
    ctx->Extensions.ARB_shader_bit_encoding = true;
+   ctx->Extensions.ARB_shading_language_packing = true;
    ctx->Extensions.OES_standard_derivatives = true;
    ctx->Extensions.ARB_texture_cube_map_array = true;
 
diff --git a/mesalib/src/glsl/standalone_scaffolding.h b/mesalib/src/glsl/standalone_scaffolding.h
index 41ce35bef..096b2f114 100644
--- a/mesalib/src/glsl/standalone_scaffolding.h
+++ b/mesalib/src/glsl/standalone_scaffolding.h
@@ -34,6 +34,9 @@
 #include "main/mtypes.h"
 
 extern "C" void
+_mesa_warning(struct gl_context *ctx, const char *fmtString, ... );
+
+extern "C" void
 _mesa_reference_shader(struct gl_context *ctx, struct gl_shader **ptr,
                        struct gl_shader *sh);
 
diff --git a/mesalib/src/glsl/strtod.c b/mesalib/src/glsl/strtod.c
index 47c1f0ed6..5d4346b5a 100644
--- a/mesalib/src/glsl/strtod.c
+++ b/mesalib/src/glsl/strtod.c
@@ -55,3 +55,25 @@ glsl_strtod(const char *s, char **end)
    return strtod(s, end);
 #endif
 }
+
+
+/**
+ * strtof() wrapper: parses in a cached "C" locale (decimal point '.') via
+ * strtof_l() where available, else plain strtof() or a narrowed strtod().
+ */
+float
+glsl_strtof(const char *s, char **end)
+{
+#if defined(_GNU_SOURCE) && !defined(__CYGWIN__) && !defined(__FreeBSD__) && \
+   !defined(__HAIKU__) && !defined(__UCLIBC__)
+   static locale_t loc = NULL;
+   if (!loc) {
+      loc = newlocale(LC_CTYPE_MASK, "C", NULL);
+   }
+   return strtof_l(s, end, loc);
+#elif _XOPEN_SOURCE >= 600 || _ISOC99_SOURCE
+   return strtof(s, end);
+#else
+   return (float) strtod(s, end);
+#endif
+}
diff --git a/mesalib/src/glsl/strtod.h b/mesalib/src/glsl/strtod.h
index 0cf6409d4..ad847dbb0 100644
--- a/mesalib/src/glsl/strtod.h
+++ b/mesalib/src/glsl/strtod.h
@@ -34,6 +34,9 @@ extern "C" {
 extern double
 glsl_strtod(const char *s, char **end);
 
+extern float
+glsl_strtof(const char *s, char **end);
+
 
 #ifdef __cplusplus
 }
diff --git a/mesalib/src/mesa/Android.libmesa_glsl_utils.mk b/mesalib/src/mesa/Android.libmesa_glsl_utils.mk
index 9c5f3493c..47f2e151b 100644
--- a/mesalib/src/mesa/Android.libmesa_glsl_utils.mk
+++ b/mesalib/src/mesa/Android.libmesa_glsl_utils.mk
@@ -35,10 +35,13 @@ include $(CLEAR_VARS)
 
 LOCAL_MODULE := libmesa_glsl_utils
 
-LOCAL_C_INCLUDES := $(MESA_TOP)/src/glsl
+LOCAL_C_INCLUDES := \
+	$(MESA_TOP)/src/glsl \
+	$(MESA_TOP)/src/mapi
 
 LOCAL_SRC_FILES := \
 	main/hash_table.c \
+	main/imports.c \
 	program/prog_hash_table.c \
 	program/symbol_table.c
 
@@ -54,10 +57,13 @@ include $(CLEAR_VARS)
 LOCAL_MODULE := libmesa_glsl_utils
 LOCAL_IS_HOST_MODULE := true
 
-LOCAL_C_INCLUDES := $(MESA_TOP)/src/glsl
+LOCAL_C_INCLUDES := \
+	$(MESA_TOP)/src/glsl \
+	$(MESA_TOP)/src/mapi
 
 LOCAL_SRC_FILES := \
 	main/hash_table.c \
+	main/imports.c \
 	program/prog_hash_table.c \
 	program/symbol_table.c
 
diff --git a/mesalib/src/mesa/main/extensions.c b/mesalib/src/mesa/main/extensions.c
index 5d01ac8ea..7ae07fb5a 100644
--- a/mesalib/src/mesa/main/extensions.c
+++ b/mesalib/src/mesa/main/extensions.c
@@ -125,6 +125,7 @@ static const struct extension extension_table[] = {
    { "GL_ARB_shader_stencil_export",               o(ARB_shader_stencil_export),               GL,             2009 },
    { "GL_ARB_shader_texture_lod",                  o(ARB_shader_texture_lod),                  GL,             2009 },
    { "GL_ARB_shading_language_100",                o(ARB_shading_language_100),                GLL,            2003 },
+   { "GL_ARB_shading_language_packing",            o(ARB_shading_language_packing),            GL,             2011 },
    { "GL_ARB_shadow",                              o(ARB_shadow),                              GLL,            2001 },
    { "GL_ARB_sync",                                o(ARB_sync),                                GL,             2003 },
    { "GL_ARB_texture_border_clamp",                o(ARB_texture_border_clamp),                GLL,            2000 },
diff --git a/mesalib/src/mesa/main/getstring.c b/mesalib/src/mesa/main/getstring.c
index 1f23cc0a4..aa3a528fd 100644
--- a/mesalib/src/mesa/main/getstring.c
+++ b/mesalib/src/mesa/main/getstring.c
@@ -74,7 +74,9 @@ shading_language_version(struct gl_context *ctx)
       break;
 
    case API_OPENGLES2:
-      return (const GLubyte *) "OpenGL ES GLSL ES 1.0.16";
+      return (ctx->Version < 30)
+         ? (const GLubyte *) "OpenGL ES GLSL ES 1.0.16"
+         : (const GLubyte *) "OpenGL ES GLSL ES 3.0";
 
    case API_OPENGLES:
       /* fall-through */
diff --git a/mesalib/src/mesa/main/imports.c b/mesalib/src/mesa/main/imports.c
index 76f835e0e..e6f754254 100644
--- a/mesalib/src/mesa/main/imports.c
+++ b/mesalib/src/mesa/main/imports.c
@@ -314,10 +314,43 @@ _mesa_bitcount_64(uint64_t n)
 #endif
 
 
+/* Round val to the nearest integer, resolving exact halves (n + 0.5)
+ * toward the even neighbour.  The C99 functions round(), rint() and
+ * nearbyint() are avoided because they are affected by fesetenv(),
+ * which the application may have called for its own purposes.  IROUND
+ * is used instead and its away-from-zero tie result is corrected here.
+ */
+int
+_mesa_round_to_even(float val)
+{
+   int result = IROUND(val);
+   const int is_tie = (val - floor(val) == 0.5);
+   const int is_odd = (result % 2 != 0);
+
+   /* An odd result on a tie is one step too far from zero. */
+   if (is_tie && is_odd)
+      result += (val > 0) ? -1 : 1;
+   return result;
+}
+
+
 /**
  * Convert a 4-byte float to a 2-byte half float.
- * Based on code from:
- * http://www.opengl.org/discussion_boards/ubb/Forum3/HTML/008786.html
+ *
+ * Not all float32 values can be represented exactly as a float16 value. We
+ * round such intermediate float32 values to the nearest float16. When the
+ * float32 lies exactly between two float16 values, we round to the one with
+ * an even mantissa.
+ *
+ * This rounding behavior has several benefits:
+ *   - It has no sign bias.
+ *
+ *   - It reproduces the behavior of real hardware: opcode F32TO16 in Intel's
+ *     GPU ISA.
+ *
+ *   - By reproducing the behavior of the GPU (at least on Intel hardware),
+ *     compile-time evaluation of constant packHalf2x16 GLSL expressions will
+ *     result in the same value as if the expression were executed on the GPU.
  */
 GLhalfARB
 _mesa_float_to_half(float val)
@@ -356,32 +389,13 @@ _mesa_float_to_half(float val)
    else {
       /* regular number */
       const int new_exp = flt_e - 127;
-      if (new_exp < -24) {
-         /* this maps to 0 */
-         /* m = 0; - already set */
-         e = 0;
-      }
-      else if (new_exp < -14) {
-         /* this maps to a denorm */
-         unsigned int exp_val = (unsigned int) (-14 - new_exp); /* 2^-exp_val*/
+      if (new_exp < -14) {
+         /* The float32 lies in the range (0.0, min_normal16) and is rounded
+          * to a nearby float16 value. The result will be either zero, subnormal,
+          * or normal.
+          */
          e = 0;
-         switch (exp_val) {
-            case 0:
-               _mesa_warning(NULL,
-                   "float_to_half: logical error in denorm creation!\n");
-               /* m = 0; - already set */
-               break;
-            case 1: m = 512 + (flt_m >> 14); break;
-            case 2: m = 256 + (flt_m >> 15); break;
-            case 3: m = 128 + (flt_m >> 16); break;
-            case 4: m = 64 + (flt_m >> 17); break;
-            case 5: m = 32 + (flt_m >> 18); break;
-            case 6: m = 16 + (flt_m >> 19); break;
-            case 7: m = 8 + (flt_m >> 20); break;
-            case 8: m = 4 + (flt_m >> 21); break;
-            case 9: m = 2 + (flt_m >> 22); break;
-            case 10: m = 1; break;
-         }
+         m = _mesa_round_to_even((1 << 24) * fabsf(fi.f));
       }
       else if (new_exp > 15) {
          /* map this value to infinity */
@@ -389,12 +403,26 @@ _mesa_float_to_half(float val)
          e = 31;
       }
       else {
-         /* regular */
+         /* The float32 lies in the range
+          *   [min_normal16, max_normal16 + max_step16)
+          * and is rounded to a nearby float16 value. The result will be
+          * either normal or infinite.
+          */
          e = new_exp + 15;
-         m = flt_m >> 13;
+         m = _mesa_round_to_even(flt_m / (float) (1 << 13));
       }
    }
 
+   assert(0 <= m && m <= 1024);
+   if (m == 1024) {
+      /* The float32 was rounded upwards into the range of the next exponent,
+       * so bump the exponent. This correctly handles the case where f32
+       * should be rounded up to float16 infinity.
+       */
+      ++e;
+      m = 0;
+   }
+
    result = (s << 15) | (e << 10) | m;
    return result;
 }
diff --git a/mesalib/src/mesa/main/imports.h b/mesalib/src/mesa/main/imports.h
index 8446ea2a3..4b783818b 100644
--- a/mesalib/src/mesa/main/imports.h
+++ b/mesalib/src/mesa/main/imports.h
@@ -548,6 +548,9 @@ _mesa_fls(unsigned int n)
 #endif
 }
 
+extern int
+_mesa_round_to_even(float val);
+
 extern GLhalfARB
 _mesa_float_to_half(float f);
 
diff --git a/mesalib/src/mesa/main/mtypes.h b/mesalib/src/mesa/main/mtypes.h
index d37e6c4c0..3369623f7 100644
--- a/mesalib/src/mesa/main/mtypes.h
+++ b/mesalib/src/mesa/main/mtypes.h
@@ -2273,11 +2273,30 @@ typedef enum
 struct gl_uniform_buffer_variable
 {
    char *Name;
+
+   /**
+    * Name of the uniform as seen by glGetUniformIndices.
+    *
+    * glGetUniformIndices requires that the block instance index \b not be
+    * present in the name of queried uniforms.
+    *
+    * \note
+    * \c gl_uniform_buffer_variable::IndexName and
+    * \c gl_uniform_buffer_variable::Name may point to identical storage.
+    */
+   char *IndexName;
+
    const struct glsl_type *Type;
    unsigned int Offset;
    GLboolean RowMajor;
 };
 
+enum gl_uniform_block_packing {
+   ubo_packing_std140,
+   ubo_packing_shared,
+   ubo_packing_packed
+};
+
 struct gl_uniform_block
 {
    /** Declared name of the uniform block */
@@ -2299,6 +2318,14 @@ struct gl_uniform_block
     * (GL_UNIFORM_BLOCK_DATA_SIZE).
     */
    GLuint UniformBufferSize;
+
+   /**
+    * Layout specified in the shader
+    *
+    * This isn't accessible through the API, but it is used while
+    * cross-validating uniform blocks.
+    */
+   enum gl_uniform_block_packing _Packing;
 };
 
 /**
@@ -3042,6 +3069,7 @@ struct gl_extensions
    GLboolean ARB_shader_stencil_export;
    GLboolean ARB_shader_texture_lod;
    GLboolean ARB_shading_language_100;
+   GLboolean ARB_shading_language_packing;
    GLboolean ARB_shadow;
    GLboolean ARB_sync;
    GLboolean ARB_texture_border_clamp;
diff --git a/mesalib/src/mesa/main/remap.c b/mesalib/src/mesa/main/remap.c
index c89fba453..a09870561 100644
--- a/mesalib/src/mesa/main/remap.c
+++ b/mesalib/src/mesa/main/remap.c
@@ -208,8 +208,10 @@ _mesa_do_init_remap_table(const char *pool,
       offset = _mesa_map_function_spec(spec);
       /* store the dispatch offset in the remap table */
       driDispatchRemapTable[i] = offset;
-      if (offset < 0)
-         _mesa_warning(NULL, "failed to remap index %d", i);
+      if (offset < 0) {
+         const char *name = spec + strlen(spec) + 1;
+         _mesa_warning(NULL, "failed to remap %s", name);
+      }
    }
 }
 
diff --git a/mesalib/src/mesa/main/shader_query.cpp b/mesalib/src/mesa/main/shader_query.cpp
index 27b1b8f56..3014a9778 100644
--- a/mesalib/src/mesa/main/shader_query.cpp
+++ b/mesalib/src/mesa/main/shader_query.cpp
@@ -106,7 +106,7 @@ _mesa_GetActiveAttrib(GLhandleARB program, GLuint desired_index,
       const ir_variable *const var = ((ir_instruction *) node)->as_variable();
 
       if (var == NULL
-	  || var->mode != ir_var_in
+	  || var->mode != ir_var_shader_in
 	  || var->location == -1)
 	 continue;
 
@@ -169,7 +169,7 @@ _mesa_GetAttribLocation(GLhandleARB program, const GLcharARB * name)
        *     attribute, or if an error occurs, -1 will be returned."
        */
       if (var == NULL
-	  || var->mode != ir_var_in
+	  || var->mode != ir_var_shader_in
 	  || var->location == -1
 	  || var->location < VERT_ATTRIB_GENERIC0)
 	 continue;
@@ -197,7 +197,7 @@ _mesa_count_active_attribs(struct gl_shader_program *shProg)
       const ir_variable *const var = ((ir_instruction *) node)->as_variable();
 
       if (var == NULL
-	  || var->mode != ir_var_in
+	  || var->mode != ir_var_shader_in
 	  || var->location == -1)
 	 continue;
 
@@ -223,7 +223,7 @@ _mesa_longest_attribute_name_length(struct gl_shader_program *shProg)
       const ir_variable *const var = ((ir_instruction *) node)->as_variable();
 
       if (var == NULL
-	  || var->mode != ir_var_in
+	  || var->mode != ir_var_shader_in
 	  || var->location == -1)
 	 continue;
 
@@ -333,7 +333,7 @@ _mesa_GetFragDataIndex(GLuint program, const GLchar *name)
        *     attribute, or if an error occurs, -1 will be returned."
        */
       if (var == NULL
-          || var->mode != ir_var_out
+          || var->mode != ir_var_shader_out
           || var->location == -1
           || var->location < FRAG_RESULT_DATA0)
          continue;
@@ -389,7 +389,7 @@ _mesa_GetFragDataLocation(GLuint program, const GLchar *name)
        *     attribute, or if an error occurs, -1 will be returned."
        */
       if (var == NULL
-	  || var->mode != ir_var_out
+	  || var->mode != ir_var_shader_out
 	  || var->location == -1
 	  || var->location < FRAG_RESULT_DATA0)
 	 continue;
diff --git a/mesalib/src/mesa/main/texparam.c b/mesalib/src/mesa/main/texparam.c
index 8d0ae16fb..52ede13c0 100644
--- a/mesalib/src/mesa/main/texparam.c
+++ b/mesalib/src/mesa/main/texparam.c
@@ -1388,10 +1388,10 @@ _mesa_GetTexParameterfv( GLenum target, GLenum pname, GLfloat *params )
          if (ctx->API != API_OPENGLES || !ctx->Extensions.OES_draw_texture)
             goto invalid_pname;
 
-         params[0] = obj->CropRect[0];
-         params[1] = obj->CropRect[1];
-         params[2] = obj->CropRect[2];
-         params[3] = obj->CropRect[3];
+         params[0] = (GLfloat) obj->CropRect[0];
+         params[1] = (GLfloat) obj->CropRect[1];
+         params[2] = (GLfloat) obj->CropRect[2];
+         params[3] = (GLfloat) obj->CropRect[3];
          break;
 
       case GL_TEXTURE_SWIZZLE_R_EXT:
diff --git a/mesalib/src/mesa/main/uniforms.c b/mesalib/src/mesa/main/uniforms.c
index 62c85b3c0..d902407a0 100644
--- a/mesalib/src/mesa/main/uniforms.c
+++ b/mesalib/src/mesa/main/uniforms.c
@@ -695,7 +695,7 @@ _mesa_GetActiveUniformBlockiv(GLuint program,
       for (i = 0; i < block->NumUniforms; i++) {
 	 unsigned offset;
 	 params[i] = _mesa_get_uniform_location(ctx, shProg,
-						block->Uniforms[i].Name,
+						block->Uniforms[i].IndexName,
 						&offset);
       }
       return;
diff --git a/mesalib/src/mesa/main/version.c b/mesalib/src/mesa/main/version.c
index 4373d7b91..e944a5518 100644
--- a/mesalib/src/mesa/main/version.c
+++ b/mesalib/src/mesa/main/version.c
@@ -323,7 +323,30 @@ compute_version_es2(struct gl_context *ctx)
                               ctx->Extensions.ARB_fragment_shader &&
                               ctx->Extensions.ARB_texture_non_power_of_two &&
                               ctx->Extensions.EXT_blend_equation_separate);
-   if (ver_2_0) {
+   /* FINISHME: This list isn't quite right. */
+   const GLboolean ver_3_0 = (ctx->Extensions.ARB_half_float_vertex &&
+                              ctx->Extensions.ARB_internalformat_query &&
+                              ctx->Extensions.ARB_map_buffer_range &&
+                              ctx->Extensions.ARB_shader_texture_lod &&
+                              ctx->Extensions.ARB_texture_float &&
+                              ctx->Extensions.ARB_texture_rg &&
+                              ctx->Extensions.ARB_texture_compression_rgtc &&
+                              ctx->Extensions.EXT_draw_buffers2 &&
+                              /* ctx->Extensions.ARB_framebuffer_object && */
+                              ctx->Extensions.EXT_framebuffer_sRGB &&
+                              ctx->Extensions.EXT_packed_float &&
+                              ctx->Extensions.EXT_texture_array &&
+                              ctx->Extensions.EXT_texture_shared_exponent &&
+                              ctx->Extensions.EXT_transform_feedback &&
+                              ctx->Extensions.NV_conditional_render &&
+                              ctx->Extensions.ARB_draw_instanced &&
+                              ctx->Extensions.ARB_uniform_buffer_object &&
+                              ctx->Extensions.EXT_texture_snorm &&
+                              ctx->Extensions.NV_primitive_restart &&
+                              ctx->Extensions.OES_depth_texture_cube_map);
+   if (ver_3_0) {
+      ctx->Version = 30;
+   } else if (ver_2_0) {
       ctx->Version = 20;
    } else {
       _mesa_problem(ctx, "Incomplete OpenGL ES 2.0 support.");
diff --git a/mesalib/src/mesa/program/ir_to_mesa.cpp b/mesalib/src/mesa/program/ir_to_mesa.cpp
index 0f7439b3b..cd89171da 100644
--- a/mesalib/src/mesa/program/ir_to_mesa.cpp
+++ b/mesalib/src/mesa/program/ir_to_mesa.cpp
@@ -623,10 +623,14 @@ type_size(const struct glsl_type *type)
        * at link time.
        */
       return 1;
-   default:
-      assert(0);
-      return 0;
+   case GLSL_TYPE_VOID:
+   case GLSL_TYPE_ERROR:
+   case GLSL_TYPE_INTERFACE:
+      assert(!"Invalid type in type_size");
+      break;
    }
+
+   return 0;
 }
 
 /**
@@ -1427,7 +1431,21 @@ ir_to_mesa_visitor::visit(ir_expression *ir)
    case ir_unop_fract:
       emit(ir, OPCODE_FRC, result_dst, op[0]);
       break;
-
+   case ir_unop_pack_snorm_2x16:
+   case ir_unop_pack_snorm_4x8:
+   case ir_unop_pack_unorm_2x16:
+   case ir_unop_pack_unorm_4x8:
+   case ir_unop_pack_half_2x16:
+   case ir_unop_unpack_snorm_2x16:
+   case ir_unop_unpack_snorm_4x8:
+   case ir_unop_unpack_unorm_2x16:
+   case ir_unop_unpack_unorm_4x8:
+   case ir_unop_unpack_half_2x16:
+   case ir_unop_unpack_half_2x16_split_x:
+   case ir_unop_unpack_half_2x16_split_y:
+   case ir_binop_pack_half_2x16_split:
+      assert(!"not supported");
+      break;
    case ir_binop_min:
       emit(ir, OPCODE_MIN, result_dst, op[0], op[1]);
       break;
@@ -1529,21 +1547,18 @@ ir_to_mesa_visitor::visit(ir_dereference_variable *ir)
 					       var->location);
 	 this->variables.push_tail(entry);
 	 break;
-      case ir_var_in:
-      case ir_var_inout:
+      case ir_var_shader_in:
 	 /* The linker assigns locations for varyings and attributes,
 	  * including deprecated builtins (like gl_Color),
 	  * user-assigned generic attributes (glBindVertexLocation),
 	  * and user-defined varyings.
-	  *
-	  * FINISHME: We would hit this path for function arguments.  Fix!
 	  */
 	 assert(var->location != -1);
          entry = new(mem_ctx) variable_storage(var,
                                                PROGRAM_INPUT,
                                                var->location);
          break;
-      case ir_var_out:
+      case ir_var_shader_out:
 	 assert(var->location != -1);
          entry = new(mem_ctx) variable_storage(var,
                                                PROGRAM_OUTPUT,
@@ -2378,7 +2393,8 @@ public:
    }
 
 private:
-   virtual void visit_field(const glsl_type *type, const char *name);
+   virtual void visit_field(const glsl_type *type, const char *name,
+                            bool row_major);
 
    struct gl_shader_program *shader_program;
    struct gl_program_parameter_list *params;
@@ -2386,10 +2402,13 @@ private:
 };
 
 void
-add_uniform_to_shader::visit_field(const glsl_type *type, const char *name)
+add_uniform_to_shader::visit_field(const glsl_type *type, const char *name,
+                                   bool row_major)
 {
    unsigned int size;
 
+   (void) row_major;
+
    if (type->is_vector() || type->is_scalar()) {
       size = type->vector_elements;
    } else {
@@ -2459,7 +2478,7 @@ _mesa_generate_parameters_list_for_uniforms(struct gl_shader_program
       ir_variable *var = ((ir_instruction *) node)->as_variable();
 
       if ((var == NULL) || (var->mode != ir_var_uniform)
-	  || var->uniform_block != -1 || (strncmp(var->name, "gl_", 3) == 0))
+	  || var->is_in_uniform_block() || (strncmp(var->name, "gl_", 3) == 0))
 	 continue;
 
       add.process(var);
@@ -2522,7 +2541,11 @@ _mesa_associate_uniform_storage(struct gl_context *ctx,
 	    format = uniform_native;
 	    columns = 1;
 	    break;
-	 default:
+         case GLSL_TYPE_ARRAY:
+         case GLSL_TYPE_VOID:
+         case GLSL_TYPE_STRUCT:
+         case GLSL_TYPE_ERROR:
+         case GLSL_TYPE_INTERFACE:
 	    assert(!"Should not get here.");
 	    break;
 	 }
diff --git a/mesalib/src/mesa/program/program.c b/mesalib/src/mesa/program/program.c
index 993803dd5..fb0aeb7ed 100644
--- a/mesalib/src/mesa/program/program.c
+++ b/mesalib/src/mesa/program/program.c
@@ -696,7 +696,7 @@ _mesa_combine_programs(struct gl_context *ctx,
    const GLuint newLength = lenA + lenB;
    GLboolean usedTemps[MAX_PROGRAM_TEMPS];
    GLuint firstTemp = 0;
-   GLbitfield inputsB;
+   GLbitfield64 inputsB;
    GLuint i;
 
    ASSERT(progA->Target == progB->Target);
@@ -724,7 +724,7 @@ _mesa_combine_programs(struct gl_context *ctx,
    if (newProg->Target == GL_FRAGMENT_PROGRAM_ARB) {
       const struct gl_fragment_program *fprogA, *fprogB;
       struct gl_fragment_program *newFprog;
-      GLbitfield progB_inputsRead = progB->InputsRead;
+      GLbitfield64 progB_inputsRead = progB->InputsRead;
       GLint progB_colorFile, progB_colorIndex;
 
       fprogA = gl_fragment_program_const(progA);
@@ -840,8 +840,8 @@ _mesa_find_used_registers(const struct gl_program *prog,
 
       for (j = 0; j < n; j++) {
          if (inst->SrcReg[j].File == file) {
-            ASSERT(inst->SrcReg[j].Index < usedSize);
-            if(inst->SrcReg[j].Index < usedSize)
+            ASSERT(inst->SrcReg[j].Index < (GLint) usedSize);
+            if (inst->SrcReg[j].Index < (GLint) usedSize)
                used[inst->SrcReg[j].Index] = GL_TRUE;
          }
       }
@@ -908,23 +908,23 @@ _mesa_valid_register_index(const struct gl_context *ctx,
       return GL_TRUE;  /* XXX or maybe false? */
 
    case PROGRAM_TEMPORARY:
-      return index >= 0 && index < c->MaxTemps;
+      return index >= 0 && index < (GLint) c->MaxTemps;
 
    case PROGRAM_ENV_PARAM:
-      return index >= 0 && index < c->MaxEnvParams;
+      return index >= 0 && index < (GLint) c->MaxEnvParams;
 
    case PROGRAM_LOCAL_PARAM:
-      return index >= 0 && index < c->MaxLocalParams;
+      return index >= 0 && index < (GLint) c->MaxLocalParams;
 
    case PROGRAM_UNIFORM:
    case PROGRAM_STATE_VAR:
       /* aka constant buffer */
-      return index >= 0 && index < c->MaxUniformComponents / 4;
+      return index >= 0 && index < (GLint) c->MaxUniformComponents / 4;
 
    case PROGRAM_CONSTANT:
       /* constant buffer w/ possible relative negative addressing */
       return (index > (int) c->MaxUniformComponents / -4 &&
-              index < c->MaxUniformComponents / 4);
+              index < (int) c->MaxUniformComponents / 4);
 
    case PROGRAM_INPUT:
       if (index < 0)
@@ -932,11 +932,11 @@ _mesa_valid_register_index(const struct gl_context *ctx,
 
       switch (shaderType) {
       case MESA_SHADER_VERTEX:
-         return index < VERT_ATTRIB_GENERIC0 + c->MaxAttribs;
+         return index < VERT_ATTRIB_GENERIC0 + (GLint) c->MaxAttribs;
       case MESA_SHADER_FRAGMENT:
-         return index < FRAG_ATTRIB_VAR0 + ctx->Const.MaxVarying;
+         return index < FRAG_ATTRIB_VAR0 + (GLint) ctx->Const.MaxVarying;
       case MESA_SHADER_GEOMETRY:
-         return index < GEOM_ATTRIB_VAR0 + ctx->Const.MaxVarying;
+         return index < GEOM_ATTRIB_VAR0 + (GLint) ctx->Const.MaxVarying;
       default:
          return GL_FALSE;
       }
@@ -947,17 +947,17 @@ _mesa_valid_register_index(const struct gl_context *ctx,
 
       switch (shaderType) {
       case MESA_SHADER_VERTEX:
-         return index < VERT_RESULT_VAR0 + ctx->Const.MaxVarying;
+         return index < VERT_RESULT_VAR0 + (GLint) ctx->Const.MaxVarying;
       case MESA_SHADER_FRAGMENT:
-         return index < FRAG_RESULT_DATA0 + ctx->Const.MaxDrawBuffers;
+         return index < FRAG_RESULT_DATA0 + (GLint) ctx->Const.MaxDrawBuffers;
       case MESA_SHADER_GEOMETRY:
-         return index < GEOM_RESULT_VAR0 + ctx->Const.MaxVarying;
+         return index < GEOM_RESULT_VAR0 + (GLint) ctx->Const.MaxVarying;
       default:
          return GL_FALSE;
       }
 
    case PROGRAM_ADDRESS:
-      return index >= 0 && index < c->MaxAddressRegs;
+      return index >= 0 && index < (GLint) c->MaxAddressRegs;
 
    default:
       _mesa_problem(ctx,
diff --git a/mesalib/src/mesa/state_tracker/st_cb_bitmap.c b/mesalib/src/mesa/state_tracker/st_cb_bitmap.c
index 843dc5be3..63dbdb29b 100644
--- a/mesalib/src/mesa/state_tracker/st_cb_bitmap.c
+++ b/mesalib/src/mesa/state_tracker/st_cb_bitmap.c
@@ -350,9 +350,8 @@ setup_bitmap_vertex_data(struct st_context *st, bool normalized,
       tBot = (GLfloat) height;
    }
 
-   u_upload_alloc(st->uploader, 0, 4 * sizeof(vertices[0]), vbuf_offset, vbuf,
-		  (void**)&vertices);
-   if (!vbuf) {
+   if (u_upload_alloc(st->uploader, 0, 4 * sizeof(vertices[0]),
+                      vbuf_offset, vbuf, (void **) &vertices) != PIPE_OK) {
       return;
    }
 
diff --git a/mesalib/src/mesa/state_tracker/st_cb_clear.c b/mesalib/src/mesa/state_tracker/st_cb_clear.c
index d01236e28..a5aa8f496 100644
--- a/mesalib/src/mesa/state_tracker/st_cb_clear.c
+++ b/mesalib/src/mesa/state_tracker/st_cb_clear.c
@@ -141,9 +141,8 @@ draw_quad(struct st_context *st,
    GLuint i, offset;
    float (*vertices)[2][4];  /**< vertex pos + color */
 
-   u_upload_alloc(st->uploader, 0, 4 * sizeof(vertices[0]), &offset, &vbuf,
-		  (void**)&vertices);
-   if (!vbuf) {
+   if (u_upload_alloc(st->uploader, 0, 4 * sizeof(vertices[0]),
+                      &offset, &vbuf, (void **) &vertices) != PIPE_OK) {
       return;
    }
 
diff --git a/mesalib/src/mesa/state_tracker/st_cb_drawpixels.c b/mesalib/src/mesa/state_tracker/st_cb_drawpixels.c
index ff8a9dc43..c944b81f6 100644
--- a/mesalib/src/mesa/state_tracker/st_cb_drawpixels.c
+++ b/mesalib/src/mesa/state_tracker/st_cb_drawpixels.c
@@ -568,9 +568,8 @@ draw_quad(struct gl_context *ctx, GLfloat x0, GLfloat y0, GLfloat z,
    struct pipe_resource *buf = NULL;
    unsigned offset;
 
-   u_upload_alloc(st->uploader, 0, 4 * sizeof(verts[0]), &offset, &buf,
-		  (void**)&verts);
-   if (!buf) {
+   if (u_upload_alloc(st->uploader, 0, 4 * sizeof(verts[0]), &offset,
+                      &buf, (void **) &verts) != PIPE_OK) {
       return;
    }
 
@@ -795,7 +794,7 @@ draw_textured_quad(struct gl_context *ctx, GLint x, GLint y, GLfloat z,
    y1 = y + height * ctx->Pixel.ZoomY;
 
    /* convert Z from [0,1] to [-1,-1] to match viewport Z scale/bias */
-   z = z * 2.0 - 1.0;
+   z = z * 2.0f - 1.0f;
 
    draw_quad(ctx, x0, y0, z, x1, y1, color, invertTex,
              normalized ? ((GLfloat) width / sv[0]->texture->width0) : (GLfloat)width,
@@ -1063,7 +1062,7 @@ static void
 clamp_size(struct pipe_context *pipe, GLsizei *width, GLsizei *height,
            struct gl_pixelstore_attrib *unpack)
 {
-   const unsigned maxSize = 
+   const int maxSize =
       1 << (pipe->screen->get_param(pipe->screen,
                                     PIPE_CAP_MAX_TEXTURE_2D_LEVELS) - 1);
 
diff --git a/mesalib/src/mesa/state_tracker/st_cb_drawtex.c b/mesalib/src/mesa/state_tracker/st_cb_drawtex.c
index 269068da2..5ca097004 100644
--- a/mesalib/src/mesa/state_tracker/st_cb_drawtex.c
+++ b/mesalib/src/mesa/state_tracker/st_cb_drawtex.c
@@ -148,10 +148,9 @@ st_DrawTex(struct gl_context *ctx, GLfloat x, GLfloat y, GLfloat z,
       GLfloat *vbuf = NULL;
       GLuint attr;
 
-      u_upload_alloc(st->uploader, 0,
-		     numAttribs * 4 * 4 * sizeof(GLfloat),
-		     &offset, &vbuffer, (void**)&vbuf);
-      if (!vbuffer) {
+      if (u_upload_alloc(st->uploader, 0,
+                         numAttribs * 4 * 4 * sizeof(GLfloat),
+                         &offset, &vbuffer, (void **) &vbuf) != PIPE_OK) {
          return;
       }
       
diff --git a/mesalib/src/mesa/state_tracker/st_cb_texture.c b/mesalib/src/mesa/state_tracker/st_cb_texture.c
index 7f07b741e..3cea2df07 100644
--- a/mesalib/src/mesa/state_tracker/st_cb_texture.c
+++ b/mesalib/src/mesa/state_tracker/st_cb_texture.c
@@ -1555,6 +1555,7 @@ void
 st_init_texture_functions(struct dd_function_table *functions)
 {
    functions->ChooseTextureFormat = st_ChooseTextureFormat;
+   functions->QuerySamplesForFormat = st_QuerySamplesForFormat;
    functions->TexImage = st_TexImage;
    functions->TexSubImage = _mesa_store_texsubimage;
    functions->CompressedTexSubImage = _mesa_store_compressed_texsubimage;
diff --git a/mesalib/src/mesa/state_tracker/st_draw.c b/mesalib/src/mesa/state_tracker/st_draw.c
index de539ca5a..de62264a1 100644
--- a/mesalib/src/mesa/state_tracker/st_draw.c
+++ b/mesalib/src/mesa/state_tracker/st_draw.c
@@ -84,7 +84,12 @@ all_varyings_in_vbos(const struct gl_client_array *arrays[])
 }
 
 
-static void
+/**
+ * Basically, translate Mesa's index buffer information into
+ * a pipe_index_buffer object.
+ * \return TRUE or FALSE for success/failure
+ */
+static boolean
 setup_index_buffer(struct st_context *st,
                    const struct _mesa_index_buffer *ib,
                    struct pipe_index_buffer *ibuffer)
@@ -100,8 +105,12 @@ setup_index_buffer(struct st_context *st,
       ibuffer->offset = pointer_to_offset(ib->ptr);
    }
    else if (st->indexbuf_uploader) {
-      u_upload_data(st->indexbuf_uploader, 0, ib->count * ibuffer->index_size,
-                    ib->ptr, &ibuffer->offset, &ibuffer->buffer);
+      if (u_upload_data(st->indexbuf_uploader, 0,
+                        ib->count * ibuffer->index_size, ib->ptr,
+                        &ibuffer->offset, &ibuffer->buffer) != PIPE_OK) {
+         /* out of memory */
+         return FALSE;
+      }
       u_upload_unmap(st->indexbuf_uploader);
    }
    else {
@@ -110,6 +119,7 @@ setup_index_buffer(struct st_context *st,
    }
 
    cso_set_index_buffer(st->cso_context, ibuffer);
+   return TRUE;
 }
 
 
@@ -220,7 +230,10 @@ st_draw_vbo(struct gl_context *ctx,
             vbo_get_minmax_indices(ctx, prims, ib, &min_index, &max_index,
                                    nr_prims);
 
-      setup_index_buffer(st, ib, &ibuffer);
+      if (!setup_index_buffer(st, ib, &ibuffer)) {
+         /* out of memory */
+         return;
+      }
 
       info.indexed = TRUE;
       if (min_index != ~0 && max_index != ~0) {
diff --git a/mesalib/src/mesa/state_tracker/st_extensions.c b/mesalib/src/mesa/state_tracker/st_extensions.c
index 18d89815d..af54cf7c8 100644
--- a/mesalib/src/mesa/state_tracker/st_extensions.c
+++ b/mesalib/src/mesa/state_tracker/st_extensions.c
@@ -516,6 +516,7 @@ void st_init_extensions(struct st_context *st)
    ctx->Extensions.ARB_fragment_shader = GL_TRUE;
    ctx->Extensions.ARB_half_float_pixel = GL_TRUE;
    ctx->Extensions.ARB_half_float_vertex = GL_TRUE;
+   ctx->Extensions.ARB_internalformat_query = GL_TRUE;
    ctx->Extensions.ARB_map_buffer_range = GL_TRUE;
    ctx->Extensions.ARB_shader_objects = GL_TRUE;
    ctx->Extensions.ARB_shading_language_100 = GL_TRUE;
@@ -594,9 +595,10 @@ void st_init_extensions(struct st_context *st)
       ctx->Const.NativeIntegers = GL_TRUE;
       ctx->Const.MaxClipPlanes = 8;
 
-      /* Extensions that only depend on GLSL 1.3. */
+      /* Extensions that either depend on GLSL 1.30 or are a subset thereof. */
       ctx->Extensions.ARB_conservative_depth = GL_TRUE;
       ctx->Extensions.ARB_shader_bit_encoding = GL_TRUE;
+      ctx->Extensions.OES_depth_texture_cube_map = GL_TRUE;
    } else {
       /* Optional integer support for GLSL 1.2. */
       if (screen->get_shader_param(screen, PIPE_SHADER_VERTEX,
diff --git a/mesalib/src/mesa/state_tracker/st_format.c b/mesalib/src/mesa/state_tracker/st_format.c
index af81f732d..7ef063953 100644
--- a/mesalib/src/mesa/state_tracker/st_format.c
+++ b/mesalib/src/mesa/state_tracker/st_format.c
@@ -1642,6 +1642,40 @@ st_ChooseTextureFormat(struct gl_context *ctx, GLenum target,
 }
 
 
+/**
+ * Called via ctx->Driver.QuerySamplesForFormat().
+ */
+size_t
+st_QuerySamplesForFormat(struct gl_context *ctx, GLenum internalFormat,
+                         int samples[16])
+{
+   struct pipe_screen *screen = st_context(ctx)->pipe->screen;
+   enum pipe_format format;
+   unsigned i, bind, num_sample_counts = 0;
+
+   if (_mesa_is_depth_or_stencil_format(internalFormat))
+      bind = PIPE_BIND_DEPTH_STENCIL;
+   else
+      bind = PIPE_BIND_RENDER_TARGET;
+
+   /* Set sample counts in descending order. */
+   for (i = 16; i > 1; i--) {
+      format = st_choose_format(screen, internalFormat, GL_NONE, GL_NONE,
+                                PIPE_TEXTURE_2D, i, bind);
+
+      if (format != PIPE_FORMAT_NONE) {
+         samples[num_sample_counts++] = i;
+      }
+   }
+
+   if (!num_sample_counts) {
+      samples[num_sample_counts++] = 1;
+   }
+
+   return num_sample_counts;
+}
+
+
 GLboolean
 st_sampler_compat_formats(enum pipe_format format1, enum pipe_format format2)
 {
diff --git a/mesalib/src/mesa/state_tracker/st_format.h b/mesalib/src/mesa/state_tracker/st_format.h
index 39397b17a..cb6e5bc96 100644
--- a/mesalib/src/mesa/state_tracker/st_format.h
+++ b/mesalib/src/mesa/state_tracker/st_format.h
@@ -67,6 +67,9 @@ st_ChooseTextureFormat(struct gl_context * ctx, GLenum target,
                        GLint internalFormat,
                        GLenum format, GLenum type);
 
+size_t
+st_QuerySamplesForFormat(struct gl_context *ctx, GLenum internalFormat,
+                         int samples[16]);
 
 /* can we use a sampler view to translate these formats
    only used to make TFP so far */
diff --git a/mesalib/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/mesalib/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
index 1d96e905c..c6ac634a2 100644
--- a/mesalib/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
+++ b/mesalib/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
@@ -984,10 +984,13 @@ type_size(const struct glsl_type *type)
        * at link time.
        */
       return 1;
-   default:
-      assert(0);
-      return 0;
+   case GLSL_TYPE_INTERFACE:
+   case GLSL_TYPE_VOID:
+   case GLSL_TYPE_ERROR:
+      assert(!"Invalid type in type_size");
+      break;
    }
+   return 0;
 }
 
 /**
@@ -1932,10 +1935,23 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir)
       }
       break;
    }
+   case ir_unop_pack_snorm_2x16:
+   case ir_unop_pack_unorm_2x16:
+   case ir_unop_pack_half_2x16:
+   case ir_unop_pack_snorm_4x8:
+   case ir_unop_pack_unorm_4x8:
+   case ir_unop_unpack_snorm_2x16:
+   case ir_unop_unpack_unorm_2x16:
+   case ir_unop_unpack_half_2x16:
+   case ir_unop_unpack_half_2x16_split_x:
+   case ir_unop_unpack_half_2x16_split_y:
+   case ir_unop_unpack_snorm_4x8:
+   case ir_unop_unpack_unorm_4x8:
+   case ir_binop_pack_half_2x16_split:
    case ir_quadop_vector:
-      /* This operation should have already been handled.
+      /* This operation is not supported, or should have already been handled.
        */
-      assert(!"Should not get here.");
+      assert(!"Invalid ir opcode in glsl_to_tgsi_visitor::visit()");
       break;
    }
 
@@ -2001,21 +2017,18 @@ glsl_to_tgsi_visitor::visit(ir_dereference_variable *ir)
         				       var->location);
          this->variables.push_tail(entry);
          break;
-      case ir_var_in:
-      case ir_var_inout:
+      case ir_var_shader_in:
          /* The linker assigns locations for varyings and attributes,
           * including deprecated builtins (like gl_Color), user-assign
           * generic attributes (glBindVertexLocation), and
           * user-defined varyings.
-          *
-          * FINISHME: We would hit this path for function arguments.  Fix!
           */
          assert(var->location != -1);
          entry = new(mem_ctx) variable_storage(var,
                                                PROGRAM_INPUT,
                                                var->location);
          break;
-      case ir_var_out:
+      case ir_var_shader_out:
          assert(var->location != -1);
          entry = new(mem_ctx) variable_storage(var,
                                                PROGRAM_OUTPUT,
@@ -2304,7 +2317,7 @@ glsl_to_tgsi_visitor::visit(ir_assignment *ir)
       assert(!ir->lhs->type->is_scalar() && !ir->lhs->type->is_vector());
       l.writemask = WRITEMASK_XYZW;
    } else if (ir->lhs->type->is_scalar() &&
-              ir->lhs->variable_referenced()->mode == ir_var_out) {
+              ir->lhs->variable_referenced()->mode == ir_var_shader_out) {
       /* FINISHME: This hack makes writing to gl_FragDepth, which lives in the
        * FINISHME: W component of fragment shader output zero, work correctly.
        */
@@ -2581,8 +2594,8 @@ glsl_to_tgsi_visitor::visit(ir_call *ir)
       ir_rvalue *param_rval = (ir_rvalue *)iter.get();
       ir_variable *param = (ir_variable *)sig_iter.get();
 
-      if (param->mode == ir_var_in ||
-          param->mode == ir_var_inout) {
+      if (param->mode == ir_var_function_in ||
+          param->mode == ir_var_function_inout) {
          variable_storage *storage = find_variable_storage(param);
          assert(storage);
 
@@ -2617,8 +2630,8 @@ glsl_to_tgsi_visitor::visit(ir_call *ir)
       ir_rvalue *param_rval = (ir_rvalue *)iter.get();
       ir_variable *param = (ir_variable *)sig_iter.get();
 
-      if (param->mode == ir_var_out ||
-          param->mode == ir_var_inout) {
+      if (param->mode == ir_var_function_out ||
+          param->mode == ir_var_function_inout) {
          variable_storage *storage = find_variable_storage(param);
          assert(storage);
 
diff --git a/mesalib/src/mesa/swrast/s_texfilter.c b/mesalib/src/mesa/swrast/s_texfilter.c
index 0a91cca06..953300f65 100644
--- a/mesalib/src/mesa/swrast/s_texfilter.c
+++ b/mesalib/src/mesa/swrast/s_texfilter.c
@@ -1647,14 +1647,14 @@ sample_2d_ewa(struct gl_context *ctx,
               GLfloat rgba[])
 {
    GLint level = lod > 0 ? lod : 0;
-   GLfloat scaling = 1.0 / (1 << level);
+   GLfloat scaling = 1.0f / (1 << level);
    const struct gl_texture_image *img =	tObj->Image[0][level];
    const struct gl_texture_image *mostDetailedImage =
       tObj->Image[0][tObj->BaseLevel];
    const struct swrast_texture_image *swImg =
       swrast_texture_image_const(mostDetailedImage);
-   GLfloat tex_u=-0.5 + texcoord[0] * swImg->WidthScale * scaling;
-   GLfloat tex_v=-0.5 + texcoord[1] * swImg->HeightScale * scaling;
+   GLfloat tex_u = -0.5f + texcoord[0] * swImg->WidthScale * scaling;
+   GLfloat tex_v = -0.5f + texcoord[1] * swImg->HeightScale * scaling;
 
    GLfloat ux = dudx * scaling;
    GLfloat vx = dvdx * scaling;
@@ -1667,20 +1667,20 @@ sample_2d_ewa(struct gl_context *ctx,
    GLfloat A = vx*vx+vy*vy+1;
    GLfloat B = -2*(ux*vx+uy*vy);
    GLfloat C = ux*ux+uy*uy+1;
-   GLfloat F = A*C-B*B/4.0;
+   GLfloat F = A*C-B*B/4.0f;
 
    /* check if it is an ellipse */
    /* ASSERT(F > 0.0); */
 
    /* Compute the ellipse's (u,v) bounding box in texture space */
-   GLfloat d = -B*B+4.0*C*A;
-   GLfloat box_u = 2.0 / d * sqrt(d*C*F); /* box_u -> half of bbox with   */
-   GLfloat box_v = 2.0 / d * sqrt(A*d*F); /* box_v -> half of bbox height */
+   GLfloat d = -B*B+4.0f*C*A;
+   GLfloat box_u = 2.0f / d * sqrtf(d*C*F); /* box_u -> half of bbox with   */
+   GLfloat box_v = 2.0f / d * sqrtf(A*d*F); /* box_v -> half of bbox height */
 
-   GLint u0 = floor(tex_u - box_u);
-   GLint u1 = ceil (tex_u + box_u);
-   GLint v0 = floor(tex_v - box_v);
-   GLint v1 = ceil (tex_v + box_v);
+   GLint u0 = (GLint) floorf(tex_u - box_u);
+   GLint u1 = (GLint) ceilf (tex_u + box_u);
+   GLint v0 = (GLint) floorf(tex_v - box_v);
+   GLint v1 = (GLint) ceilf (tex_v + box_v);
 
    GLfloat num[4] = {0.0F, 0.0F, 0.0F, 0.0F};
    GLfloat newCoord[2];
@@ -1692,7 +1692,7 @@ sample_2d_ewa(struct gl_context *ctx,
    /* Scale ellipse formula to directly index the Filter Lookup Table.
     * i.e. scale so that F = WEIGHT_LUT_SIZE-1
     */
-   double formScale = (double) (WEIGHT_LUT_SIZE - 1) / F;
+   GLfloat formScale = (GLfloat) (WEIGHT_LUT_SIZE - 1) / F;
    A *= formScale;
    B *= formScale;
    C *= formScale;
@@ -1715,7 +1715,7 @@ sample_2d_ewa(struct gl_context *ctx,
             /* as a LUT is used, q must never be negative;
              * should not happen, though
              */
-            const GLint qClamped = q >= 0.0F ? q : 0;
+            const GLint qClamped = q >= 0.0F ? (GLint) q : 0;
             GLfloat weight = weightLut[qClamped];
 
             newCoord[0] = u / ((GLfloat) img->Width2);
@@ -1795,19 +1795,19 @@ sample_2d_footprint(struct gl_context *ctx,
 
    /*  Calculate the per anisotropic sample offsets in s,t space. */
    if (Px2 > Py2) {
-      numSamples = ceil(sqrtf(Px2));
+      numSamples = (GLint) ceilf(sqrtf(Px2));
       ds = ux / ((GLfloat) img->Width2);
       dt = vx / ((GLfloat) img->Height2);
    }
    else {
-      numSamples = ceil(sqrtf(Py2));
+      numSamples = (GLint) ceilf(sqrtf(Py2));
       ds = uy / ((GLfloat) img->Width2);
       dt = vy / ((GLfloat) img->Height2);
    }
 
    for (s = 0; s<numSamples; s++) {
-      newCoord[0] = texcoord[0] + ds * ((GLfloat)(s+1) / (numSamples+1) -0.5);
-      newCoord[1] = texcoord[1] + dt * ((GLfloat)(s+1) / (numSamples+1) -0.5);
+      newCoord[0] = texcoord[0] + ds * ((GLfloat)(s+1) / (numSamples+1) -0.5f);
+      newCoord[1] = texcoord[1] + dt * ((GLfloat)(s+1) / (numSamples+1) -0.5f);
 
       sample_2d_linear(ctx, samp, img, newCoord, rgba);
       num[0] += rgba[0];
@@ -1956,7 +1956,7 @@ sample_lambda_2d_aniso(struct gl_context *ctx,
       /* note: we need to have Pmin=sqrt(Pmin2) here, but we can avoid
        * this since 0.5*log(x) = log(sqrt(x))
        */
-      lod = 0.5 * LOG2(Pmin2);
+      lod = 0.5f * LOG2(Pmin2);
       
       if (adjustLOD) {
          /* from swrast/s_texcombine.c _swrast_texture_span */
@@ -1988,7 +1988,7 @@ sample_lambda_2d_aniso(struct gl_context *ctx,
           * seem to be worth the extra running time.
           */
          sample_2d_ewa(ctx, samp, tObj, texcoords[i],
-                       dudx, dvdx, dudy, dvdy, floor(lod), rgba[i]);
+                       dudx, dvdx, dudy, dvdy, (GLint) floorf(lod), rgba[i]);
 
          /* unused: */
          (void) sample_2d_footprint;
diff --git a/mesalib/src/mesa/vbo/vbo_exec_api.c b/mesalib/src/mesa/vbo/vbo_exec_api.c
index 985f2209c..353f8cfde 100644
--- a/mesalib/src/mesa/vbo/vbo_exec_api.c
+++ b/mesalib/src/mesa/vbo/vbo_exec_api.c
@@ -124,6 +124,11 @@ void vbo_exec_vtx_wrap( struct vbo_exec_context *exec )
     */
    vbo_exec_wrap_buffers( exec );
    
+   if (!exec->vtx.buffer_ptr) {
+      /* probably ran out of memory earlier when allocating the VBO */
+      return;
+   }
+
    /* Copy stored stored vertices to start of new list. 
     */
    assert(exec->vtx.max_vert - exec->vtx.vert_count > exec->vtx.copied.nr);
diff --git a/mkfontscale/configure.ac b/mkfontscale/configure.ac
index 4340f99e1..4c7e599d5 100644
--- a/mkfontscale/configure.ac
+++ b/mkfontscale/configure.ac
@@ -27,6 +27,7 @@ AC_INIT([mkfontscale], [1.1.0],
         [mkfontscale])
 AC_CONFIG_SRCDIR([Makefile.am])
 AC_CONFIG_HEADERS([config.h])
+AC_USE_SYSTEM_EXTENSIONS
 
 # Initialize Automake
 AM_INIT_AUTOMAKE([foreign dist-bzip2])
diff --git a/mkfontscale/hash.c b/mkfontscale/hash.c
index c2cf9caa3..3adfb6861 100644
--- a/mkfontscale/hash.c
+++ b/mkfontscale/hash.c
@@ -20,6 +20,8 @@
   THE SOFTWARE.
 */
 
+#include "config.h"
+
 #include <stdlib.h>
 #include <stdio.h>
 #include <string.h>
@@ -41,14 +43,11 @@ hash(const char *string)
 }
 
 static void
-strcpy_lwr(char *dst, const char *src)
+str_tolower(char *s)
 {
-    while(1) {
-        *dst = tolower(*src);
-        if(*src == '\0')
-            break;
-        src++;
-        dst++;
+    while(*s != '\0') {
+        *s = tolower(*s);
+        s++;
     }
 }
 
@@ -97,12 +96,11 @@ putHash(HashTablePtr table, char *key, char *value, int prio)
     for(bp = table[i]; bp; bp = bp->next) {
         if(strcasecmp(bp->key, key) == 0) {
             if(prio > bp->prio) {
-                keycopy = malloc(strlen(key) + 1);
+                keycopy = strdup(key);
                 if(keycopy == NULL) goto fail;
-                strcpy_lwr(keycopy, key);
-                valuecopy = malloc(strlen(value) + 1);
+                str_tolower(keycopy);
+                valuecopy = strdup(value);
                 if(valuecopy == NULL) goto fail;
-                strcpy(valuecopy, value);
                 free(bp->key);
                 free(bp->value);
                 bp->key = keycopy;
@@ -111,14 +109,13 @@ putHash(HashTablePtr table, char *key, char *value, int prio)
             return 1;
         }
     }
-    keycopy = malloc(strlen(key) + 1);
+    keycopy = strdup(key);
     if(keycopy == NULL)
         goto fail;
-    strcpy_lwr(keycopy, key);
-    valuecopy = malloc(strlen(value) + 1);
+    str_tolower(keycopy);
+    valuecopy = strdup(value);
     if(valuecopy == NULL)
         goto fail;
-    strcpy(valuecopy, value);
     bp = malloc(sizeof(HashBucketRec));
     if(bp == NULL)
         goto fail;
diff --git a/mkfontscale/ident.c b/mkfontscale/ident.c
index bf544832c..41212575e 100644
--- a/mkfontscale/ident.c
+++ b/mkfontscale/ident.c
@@ -315,10 +315,9 @@ pcfIdentify(fontFile *f, char **name)
     if(i >= nprops)
         goto fail;
 
-    s = malloc(strlen(strings + props[i].value) + 1);
+    s = strdup(strings + props[i].value);
     if(s == NULL)
         goto fail;
-    strcpy(s, strings + props[i].value);
     *name = s;
     free(strings);
     free(props);
diff --git a/mkfontscale/mkfontscale.c b/mkfontscale/mkfontscale.c
index 5cf5cb9af..a67f28338 100644
--- a/mkfontscale/mkfontscale.c
+++ b/mkfontscale/mkfontscale.c
@@ -20,6 +20,8 @@
   THE SOFTWARE.
 */
 
+#include "config.h"
+
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -896,10 +898,9 @@ doDirectory(const char *dirname_given, int numEncodings, ListPtr encodingsToDo)
                 BDF_PropertyRec prop;
                 rc = FT_Get_BDF_Property(face, "FONT", &prop);
                 if(rc == 0 && prop.type == BDF_PROPERTY_TYPE_ATOM) {
-                    xlfd_name = malloc(strlen(prop.u.atom) + 1);
+                    xlfd_name = strdup(prop.u.atom);
                     if(xlfd_name == NULL)
                         goto done;
-                    strcpy(xlfd_name, prop.u.atom);
                 }
             }
         }
diff --git a/pixman/configure.ac b/pixman/configure.ac
index 515e31218..a93e2905b 100644
--- a/pixman/configure.ac
+++ b/pixman/configure.ac
@@ -968,6 +968,22 @@ fi
 AC_MSG_RESULT($support_for_attribute_constructor)
 AC_SUBST(TOOLCHAIN_SUPPORTS_ATTRIBUTE_CONSTRUCTOR)
 
+dnl =====================================
+dnl __float128
+
+support_for_float128=no
+
+AC_MSG_CHECKING(for __float128)
+AC_LINK_IFELSE([AC_LANG_SOURCE([[
+__float128 a = 1.0Q, b = 2.0Q; int main (void) { return a + b; }
+]])], support_for_float128=yes)
+
+if test x$support_for_float128 = xyes; then
+   AC_DEFINE([HAVE_FLOAT128], [], [Whether the tool chain supports __float128])
+fi
+
+AC_MSG_RESULT($support_for_float128)
+
 dnl ==================
 dnl libpng
 
diff --git a/pixman/demos/scale.c b/pixman/demos/scale.c
index 9100ff72a..869ada12b 100644
--- a/pixman/demos/scale.c
+++ b/pixman/demos/scale.c
@@ -39,6 +39,7 @@ typedef struct
     GtkAdjustment *     scale_x_adjustment;
     GtkAdjustment *     scale_y_adjustment;
     GtkAdjustment *     rotate_adjustment;
+    GtkAdjustment *	subsample_adjustment;
     int                 scaled_width;
     int                 scaled_height;
 } app_t;
@@ -236,7 +237,8 @@ rescale (GtkWidget *may_be_null, app_t *app)
 	get_value (app, filters, "reconstruct_y_combo_box"),
 	get_value (app, filters, "sample_x_combo_box"),
 	get_value (app, filters, "sample_y_combo_box"),
-        4, 4);
+	gtk_adjustment_get_value (app->subsample_adjustment),
+	gtk_adjustment_get_value (app->subsample_adjustment));
 
     pixman_image_set_filter (app->original, PIXMAN_FILTER_SEPARABLE_CONVOLUTION, params, n_params);
 
@@ -360,10 +362,13 @@ app_new (pixman_image_t *original)
         GTK_ADJUSTMENT (gtk_builder_get_object (app->builder, "scale_y_adjustment"));
     app->rotate_adjustment =
         GTK_ADJUSTMENT (gtk_builder_get_object (app->builder, "rotate_adjustment"));
+    app->subsample_adjustment =
+	GTK_ADJUSTMENT (gtk_builder_get_object (app->builder, "subsample_adjustment"));
 
     g_signal_connect (app->scale_x_adjustment, "value_changed", G_CALLBACK (rescale), app);
     g_signal_connect (app->scale_y_adjustment, "value_changed", G_CALLBACK (rescale), app);
     g_signal_connect (app->rotate_adjustment, "value_changed", G_CALLBACK (rescale), app);
+    g_signal_connect (app->subsample_adjustment, "value_changed", G_CALLBACK (rescale), app);
     
     widget = get_widget (app, "scale_x_scale");
     gtk_scale_add_mark (GTK_SCALE (widget), 0.0, GTK_POS_LEFT, NULL);
diff --git a/pixman/demos/scale.ui b/pixman/demos/scale.ui
index f7c0c805f..b3450d34d 100644
--- a/pixman/demos/scale.ui
+++ b/pixman/demos/scale.ui
@@ -23,6 +23,14 @@
     <property name="page_increment">10</property>
     <property name="page_size">10</property>
   </object>
+  <object class="GtkAdjustment" id="subsample_adjustment">
+    <property name="lower">1</property>
+    <property name="upper">12</property>
+    <property name="step_increment">1</property>
+    <property name="page_increment">1</property>
+    <property name="page_size">0</property>
+    <property name="value">4</property>
+  </object>
   <object class="GtkWindow" id="main">
     <child>
       <object class="GtkHBox" id="u">
@@ -51,6 +59,7 @@
         <child>
           <object class="GtkVBox" id="box1">
             <property name="visible">True</property>
+	    <property name="spacing">12</property>
             <child>
               <object class="GtkHBox" id="box2">
                 <property name="visible">True</property>
@@ -234,6 +243,17 @@
                       </packing>
                     </child>
                     <child>
+                      <object class="GtkLabel" id="label9">
+                        <property name="visible">True</property>
+                        <property name="xalign">1</property>
+                        <property name="label" translatable="yes">&lt;b&gt;Subsample:&lt;/b&gt;</property>
+                        <property name="use_markup">True</property>
+                      </object>
+                      <packing>
+                        <property name="top_attach">5</property>
+                      </packing>
+                    </child>
+                    <child>
                       <object class="GtkComboBox" id="reconstruct_x_combo_box">
                         <property name="visible">True</property>
                       </object>
@@ -277,6 +297,16 @@
                         <property name="top_attach">4</property>
                       </packing>
                     </child>
+                    <child>
+                      <object class="GtkSpinButton" id="subsample_spin_button">
+                        <property name="visible">True</property>
+			<property name="adjustment">subsample_adjustment</property>
+                      </object>
+                      <packing>
+                        <property name="left_attach">1</property>
+                        <property name="top_attach">5</property>
+                      </packing>
+                    </child>
                   </object>
                   <packing>
                     <property name="expand">False</property>
diff --git a/pixman/pixman/pixman-fast-path.c b/pixman/pixman/pixman-fast-path.c
index c625e0c4a..247aea645 100644
--- a/pixman/pixman/pixman-fast-path.c
+++ b/pixman/pixman/pixman-fast-path.c
@@ -739,36 +739,6 @@ fast_composite_over_8888_0565 (pixman_implementation_t *imp,
 }
 
 static void
-fast_composite_src_x888_0565 (pixman_implementation_t *imp,
-                              pixman_composite_info_t *info)
-{
-    PIXMAN_COMPOSITE_ARGS (info);
-    uint16_t    *dst_line, *dst;
-    uint32_t    *src_line, *src, s;
-    int dst_stride, src_stride;
-    int32_t w;
-
-    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
-    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
-
-    while (height--)
-    {
-	dst = dst_line;
-	dst_line += dst_stride;
-	src = src_line;
-	src_line += src_stride;
-	w = width;
-
-	while (w--)
-	{
-	    s = *src++;
-	    *dst = convert_8888_to_0565 (s);
-	    dst++;
-	}
-    }
-}
-
-static void
 fast_composite_add_8_8 (pixman_implementation_t *imp,
 			pixman_composite_info_t *info)
 {
@@ -1243,6 +1213,18 @@ fast_composite_tiled_repeat (pixman_implementation_t *imp,
     pixman_composite_func_t func;
     pixman_format_code_t mask_format;
     uint32_t src_flags, mask_flags;
+    int32_t sx, sy;
+    int32_t width_remain;
+    int32_t num_pixels;
+    int32_t src_width;
+    int32_t i, j;
+    pixman_image_t extended_src_image;
+    uint32_t extended_src[REPEAT_MIN_WIDTH * 2];
+    pixman_bool_t need_src_extension;
+    uint32_t *src_line;
+    int32_t src_stride;
+    int32_t src_bpp;
+    pixman_composite_info_t info2 = *info;
 
     src_flags = (info->src_flags & ~FAST_PATH_NORMAL_REPEAT) |
 		    FAST_PATH_SAMPLES_COVER_CLIP_NEAREST;
@@ -1258,149 +1240,131 @@ fast_composite_tiled_repeat (pixman_implementation_t *imp,
 	mask_flags = FAST_PATH_IS_OPAQUE;
     }
 
-    if (_pixman_implementation_lookup_composite (
-	    imp->toplevel, info->op,
-	    src_image->common.extended_format_code, src_flags,
-	    mask_format, mask_flags,
-	    dest_image->common.extended_format_code, info->dest_flags,
-	    &imp, &func))
+    _pixman_implementation_lookup_composite (
+	imp->toplevel, info->op,
+	src_image->common.extended_format_code, src_flags,
+	mask_format, mask_flags,
+	dest_image->common.extended_format_code, info->dest_flags,
+	&imp, &func);
+
+    src_bpp = PIXMAN_FORMAT_BPP (src_image->bits.format);
+
+    if (src_image->bits.width < REPEAT_MIN_WIDTH		&&
+	(src_bpp == 32 || src_bpp == 16 || src_bpp == 8)	&&
+	!src_image->bits.indexed)
     {
-	int32_t sx, sy;
-	int32_t width_remain;
-	int32_t num_pixels;
-	int32_t src_width;
-	int32_t i, j;
-	pixman_image_t extended_src_image;
-	uint32_t extended_src[REPEAT_MIN_WIDTH * 2];
-	pixman_bool_t need_src_extension;
-	uint32_t *src_line;
-	int32_t src_stride;
-	int32_t src_bpp;
-	pixman_composite_info_t info2 = *info;
-
-	src_bpp = PIXMAN_FORMAT_BPP (src_image->bits.format);
-
-	if (src_image->bits.width < REPEAT_MIN_WIDTH		&&
-	    (src_bpp == 32 || src_bpp == 16 || src_bpp == 8)	&&
-	    !src_image->bits.indexed)
-	{
-	    sx = src_x;
-	    sx = MOD (sx, src_image->bits.width);
-	    sx += width;
-	    src_width = 0;
+	sx = src_x;
+	sx = MOD (sx, src_image->bits.width);
+	sx += width;
+	src_width = 0;
 
-	    while (src_width < REPEAT_MIN_WIDTH && src_width <= sx)
-		src_width += src_image->bits.width;
+	while (src_width < REPEAT_MIN_WIDTH && src_width <= sx)
+	    src_width += src_image->bits.width;
 
-	    src_stride = (src_width * (src_bpp >> 3) + 3) / (int) sizeof (uint32_t);
+	src_stride = (src_width * (src_bpp >> 3) + 3) / (int) sizeof (uint32_t);
 
-	    /* Initialize/validate stack-allocated temporary image */
-	    _pixman_bits_image_init (&extended_src_image, src_image->bits.format,
-				     src_width, 1, &extended_src[0], src_stride,
-				     FALSE);
-	    _pixman_image_validate (&extended_src_image);
+	/* Initialize/validate stack-allocated temporary image */
+	_pixman_bits_image_init (&extended_src_image, src_image->bits.format,
+				 src_width, 1, &extended_src[0], src_stride,
+				 FALSE);
+	_pixman_image_validate (&extended_src_image);
 
-	    info2.src_image = &extended_src_image;
-	    need_src_extension = TRUE;
-	}
-	else
-	{
-	    src_width = src_image->bits.width;
-	    need_src_extension = FALSE;
-	}
+	info2.src_image = &extended_src_image;
+	need_src_extension = TRUE;
+    }
+    else
+    {
+	src_width = src_image->bits.width;
+	need_src_extension = FALSE;
+    }
 
-	sx = src_x;
-	sy = src_y;
+    sx = src_x;
+    sy = src_y;
 
-	while (--height >= 0)
-	{
-	    sx = MOD (sx, src_width);
-	    sy = MOD (sy, src_image->bits.height);
+    while (--height >= 0)
+    {
+	sx = MOD (sx, src_width);
+	sy = MOD (sy, src_image->bits.height);
 
-	    if (need_src_extension)
+	if (need_src_extension)
+	{
+	    if (src_bpp == 32)
 	    {
-		if (src_bpp == 32)
-		{
-		    PIXMAN_IMAGE_GET_LINE (src_image, 0, sy, uint32_t, src_stride, src_line, 1);
+		PIXMAN_IMAGE_GET_LINE (src_image, 0, sy, uint32_t, src_stride, src_line, 1);
 
-		    for (i = 0; i < src_width; )
-		    {
-			for (j = 0; j < src_image->bits.width; j++, i++)
-			    extended_src[i] = src_line[j];
-		    }
-		}
-		else if (src_bpp == 16)
+		for (i = 0; i < src_width; )
 		{
-		    uint16_t *src_line_16;
-
-		    PIXMAN_IMAGE_GET_LINE (src_image, 0, sy, uint16_t, src_stride,
-					   src_line_16, 1);
-		    src_line = (uint32_t*)src_line_16;
-
-		    for (i = 0; i < src_width; )
-		    {
-			for (j = 0; j < src_image->bits.width; j++, i++)
-			    ((uint16_t*)extended_src)[i] = ((uint16_t*)src_line)[j];
-		    }
+		    for (j = 0; j < src_image->bits.width; j++, i++)
+			extended_src[i] = src_line[j];
 		}
-		else if (src_bpp == 8)
-		{
-		    uint8_t *src_line_8;
+	    }
+	    else if (src_bpp == 16)
+	    {
+		uint16_t *src_line_16;
 
-		    PIXMAN_IMAGE_GET_LINE (src_image, 0, sy, uint8_t, src_stride,
-					   src_line_8, 1);
-		    src_line = (uint32_t*)src_line_8;
+		PIXMAN_IMAGE_GET_LINE (src_image, 0, sy, uint16_t, src_stride,
+				       src_line_16, 1);
+		src_line = (uint32_t*)src_line_16;
 
-		    for (i = 0; i < src_width; )
-		    {
-			for (j = 0; j < src_image->bits.width; j++, i++)
-			    ((uint8_t*)extended_src)[i] = ((uint8_t*)src_line)[j];
-		    }
+		for (i = 0; i < src_width; )
+		{
+		    for (j = 0; j < src_image->bits.width; j++, i++)
+			((uint16_t*)extended_src)[i] = ((uint16_t*)src_line)[j];
 		}
-
-		info2.src_y = 0;
 	    }
-	    else
+	    else if (src_bpp == 8)
 	    {
-		info2.src_y = sy;
+		uint8_t *src_line_8;
+
+		PIXMAN_IMAGE_GET_LINE (src_image, 0, sy, uint8_t, src_stride,
+				       src_line_8, 1);
+		src_line = (uint32_t*)src_line_8;
+
+		for (i = 0; i < src_width; )
+		{
+		    for (j = 0; j < src_image->bits.width; j++, i++)
+			((uint8_t*)extended_src)[i] = ((uint8_t*)src_line)[j];
+		}
 	    }
 
-	    width_remain = width;
+	    info2.src_y = 0;
+	}
+	else
+	{
+	    info2.src_y = sy;
+	}
 
-	    while (width_remain > 0)
-	    {
-		num_pixels = src_width - sx;
+	width_remain = width;
 
-		if (num_pixels > width_remain)
-		    num_pixels = width_remain;
+	while (width_remain > 0)
+	{
+	    num_pixels = src_width - sx;
 
-		info2.src_x = sx;
-		info2.width = num_pixels;
-		info2.height = 1;
+	    if (num_pixels > width_remain)
+		num_pixels = width_remain;
 
-		func (imp, &info2);
+	    info2.src_x = sx;
+	    info2.width = num_pixels;
+	    info2.height = 1;
 
-		width_remain -= num_pixels;
-		info2.mask_x += num_pixels;
-		info2.dest_x += num_pixels;
-		sx = 0;
-	    }
+	    func (imp, &info2);
 
-	    sx = src_x;
-	    sy++;
-	    info2.mask_x = info->mask_x;
-	    info2.mask_y++;
-	    info2.dest_x = info->dest_x;
-	    info2.dest_y++;
+	    width_remain -= num_pixels;
+	    info2.mask_x += num_pixels;
+	    info2.dest_x += num_pixels;
+	    sx = 0;
 	}
 
-	if (need_src_extension)
-	    _pixman_image_fini (&extended_src_image);
-    }
-    else
-    {
-	_pixman_log_error (FUNC, "Didn't find a suitable function ");
+	sx = src_x;
+	sy++;
+	info2.mask_x = info->mask_x;
+	info2.mask_y++;
+	info2.dest_x = info->dest_x;
+	info2.dest_y++;
     }
+
+    if (need_src_extension)
+	_pixman_image_fini (&extended_src_image);
 }
 
 /* Use more unrolling for src_0565_0565 because it is typically CPU bound */
@@ -1913,10 +1877,6 @@ static const pixman_fast_path_t c_fast_paths[] =
     PIXMAN_STD_FAST_PATH (SRC, x1r5g5b5, null, x1r5g5b5, fast_composite_src_memcpy),
     PIXMAN_STD_FAST_PATH (SRC, a1r5g5b5, null, x1r5g5b5, fast_composite_src_memcpy),
     PIXMAN_STD_FAST_PATH (SRC, a8, null, a8, fast_composite_src_memcpy),
-    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, fast_composite_src_x888_0565),
-    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, fast_composite_src_x888_0565),
-    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, fast_composite_src_x888_0565),
-    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, fast_composite_src_x888_0565),
     PIXMAN_STD_FAST_PATH (IN, a8, null, a8, fast_composite_in_8_8),
     PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, fast_composite_in_n_8_8),
 
@@ -2199,12 +2159,200 @@ fast_path_fill (pixman_implementation_t *imp,
     return TRUE;
 }
 
+/*****************************************************************************/
+
+static uint32_t *
+fast_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
+{   /* Expands one scanline of 16-bit r5g6b5 pixels at iter->bits into 32-bit pixels in iter->buffer and returns iter->buffer; 'mask' is not read. */
+    int32_t w = iter->width;
+    uint32_t *dst = iter->buffer;
+    const uint16_t *src = (const uint16_t *)iter->bits;
+
+    iter->bits += iter->stride; /* leave iter->bits pointing one stride further for the next call */
+
+    /* Align the source buffer at 4 bytes boundary: one leading pixel is converted on its own so the 32-bit loads below start on a 4-byte address (assumes src is at least 2-byte aligned) */
+    if (w > 0 && ((uintptr_t)src & 3))
+    {
+	*dst++ = convert_0565_to_8888 (*src++);
+	w--;
+    }
+    /* Process two pixels per iteration: both 16-bit pixels are loaded as one 32-bit word and widened in parallel */
+    while ((w -= 2) >= 0)
+    {
+	uint32_t sr, sb, sg, t0, t1;
+	uint32_t s = *(const uint32_t *)src;
+	src += 2;
+	sr = (s >> 8) & 0x00F800F8; /* red: bits 11-15 of each half moved to bits 3-7 of that half */
+	sb = (s << 3) & 0x00F800F8; /* blue: bits 0-4 of each half moved to bits 3-7 */
+	sg = (s >> 3) & 0x00FC00FC; /* green: bits 5-10 of each half moved to bits 2-7 */
+	sr |= sr >> 5; /* replicate the top bits into the low bits to fill all 8 bits */
+	sb |= sb >> 5;
+	sg |= sg >> 6;
+	t0 = ((sr << 16) & 0x00FF0000) | ((sg << 8) & 0x0000FF00) |
+	     (sb & 0xFF) | 0xFF000000; /* pixel from the low 16 bits of s, alpha set to 0xFF */
+	t1 = (sr & 0x00FF0000) | ((sg >> 8) & 0x0000FF00) |
+	     (sb >> 16) | 0xFF000000; /* pixel from the high 16 bits of s, alpha set to 0xFF */
+#ifdef WORDS_BIGENDIAN
+	*dst++ = t1; /* on big-endian the first pixel in memory is the high half of s */
+	*dst++ = t0;
+#else
+	*dst++ = t0;
+	*dst++ = t1;
+#endif
+    }
+    if (w & 1) /* the loop exits with w == -1 (one pixel left) or w == -2 (none left) */
+    {
+	*dst = convert_0565_to_8888 (*src);
+    }
+
+    return iter->buffer;
+}
+
+static uint32_t *
+fast_dest_fetch_noop (pixman_iter_t *iter, const uint32_t *mask)
+{   /* Scanline getter that reads no pixels: it only advances iter->bits by one stride and returns iter->buffer unchanged; 'mask' is not read. */
+    iter->bits += iter->stride;
+    return iter->buffer;
+}
+
+/* Helper function for a workaround, which tries to ensure that 0x1F001F
+ * constant is always allocated in a register on RISC architectures.
+ * Packs a 32-bit pixel 's' into r5g6b5 in the low 16 bits of the result; the caller must pass 0x1F001F as 'x1F001F'. */
+static force_inline uint32_t
+convert_8888_to_0565_workaround (uint32_t s, uint32_t x1F001F)
+{
+    uint32_t a, b;
+    a = (s >> 3) & x1F001F; /* top 5 bits of blue land in bits 0-4, top 5 bits of red in bits 16-20 */
+    b = s & 0xFC00; /* top 6 bits of green, still at bits 10-15 */
+    a |= a >> 5; /* copy red from bits 16-20 down to bits 11-15 */
+    a |= b >> 5; /* place green at bits 5-10 */
+    return a; /* bits 16-20 still hold the red copy; only the low 16 bits are the r5g6b5 pixel */
+}
+
+static void
+fast_write_back_r5g6b5 (pixman_iter_t *iter)
+{   /* Converts iter->width 32-bit pixels from iter->buffer to r5g6b5 and stores them in the scanline one stride before iter->bits. */
+    int32_t w = iter->width;
+    uint16_t *dst = (uint16_t *)(iter->bits - iter->stride); /* presumably the scanline getter already advanced iter->bits past this row - confirm */
+    const uint32_t *src = iter->buffer;
+    /* Workaround to ensure that x1F001F variable is allocated in a register */
+    static volatile uint32_t volatile_x1F001F = 0x1F001F;
+    uint32_t x1F001F = volatile_x1F001F; /* read the volatile once; the local copy is what the conversions use */
+
+    while ((w -= 4) >= 0) /* four pixels per iteration */
+    {
+	uint32_t s1 = *src++;
+	uint32_t s2 = *src++;
+	uint32_t s3 = *src++;
+	uint32_t s4 = *src++;
+	*dst++ = convert_8888_to_0565_workaround (s1, x1F001F); /* storing to uint16_t keeps only the low 16 bits */
+	*dst++ = convert_8888_to_0565_workaround (s2, x1F001F);
+	*dst++ = convert_8888_to_0565_workaround (s3, x1F001F);
+	*dst++ = convert_8888_to_0565_workaround (s4, x1F001F);
+    }
+    if (w & 2) /* w is now in [-4, -1]; its two low bits equal the number of pixels left (0-3) */
+    {
+	*dst++ = convert_8888_to_0565_workaround (*src++, x1F001F);
+	*dst++ = convert_8888_to_0565_workaround (*src++, x1F001F);
+    }
+    if (w & 1) /* one final odd pixel */
+    {
+	*dst = convert_8888_to_0565_workaround (*src, x1F001F);
+    }
+}
+
+typedef struct /* one table entry: the scanline routines to use for a given pixel format */
+{
+    pixman_format_code_t	format; /* compared against image->common.extended_format_code by the iter init functions */
+    pixman_iter_get_scanline_t	get_scanline; /* installed as iter->get_scanline on a match */
+    pixman_iter_write_back_t	write_back; /* installed as iter->write_back for destination iterators */
+} fetcher_info_t;
+
+static const fetcher_info_t fetchers[] = /* formats handled by the fast-path iterators in this file */
+{
+    { PIXMAN_r5g6b5, fast_fetch_r5g6b5, fast_write_back_r5g6b5 },
+    { PIXMAN_null } /* terminator: the lookup loops stop when format == PIXMAN_null */
+};
+
+static pixman_bool_t
+fast_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
+{   /* Sets up 'iter' as a source iterator if its image matches an entry in fetchers[]; returns TRUE when it did, FALSE otherwise. 'imp' is not read. */
+    pixman_image_t *image = iter->image;
+
+#define FLAGS								\
+    (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM |		\
+     FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST)
+
+    if ((iter->iter_flags & ITER_NARROW)			&&
+	(iter->image_flags & FLAGS) == FLAGS) /* every bit in FLAGS must be set */
+    {
+	const fetcher_info_t *f;
+
+	for (f = &fetchers[0]; f->format != PIXMAN_null; f++)
+	{
+	    if (image->common.extended_format_code == f->format)
+	    {
+		uint8_t *b = (uint8_t *)image->bits.bits;
+		int s = image->bits.rowstride * 4; /* stride in bytes; presumably rowstride counts 32-bit words - confirm */
+
+		iter->bits = b + s * iter->y + iter->x * PIXMAN_FORMAT_BPP (f->format) / 8; /* byte address of pixel (iter->x, iter->y) */
+		iter->stride = s;
+
+		iter->get_scanline = f->get_scanline; /* only the getter is installed here; write_back is left untouched */
+		return TRUE;
+	    }
+	}
+    }
+
+    return FALSE; /* flags did not match or the format is not in fetchers[] */
+}
+
+static pixman_bool_t
+fast_dest_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
+{   /* Sets up 'iter' as a destination iterator if its image matches an entry in fetchers[]; returns TRUE when it did, FALSE otherwise. 'imp' is not read. */
+    pixman_image_t *image = iter->image;
+
+    if ((iter->iter_flags & ITER_NARROW)		&&
+	(iter->image_flags & FAST_PATH_STD_DEST_FLAGS) == FAST_PATH_STD_DEST_FLAGS)
+    {
+	const fetcher_info_t *f;
+
+	for (f = &fetchers[0]; f->format != PIXMAN_null; f++)
+	{
+	    if (image->common.extended_format_code == f->format)
+	    {
+		uint8_t *b = (uint8_t *)image->bits.bits;
+		int s = image->bits.rowstride * 4; /* stride in bytes; presumably rowstride counts 32-bit words - confirm */
+
+		iter->bits = b + s * iter->y + iter->x * PIXMAN_FORMAT_BPP (f->format) / 8; /* byte address of pixel (iter->x, iter->y) */
+		iter->stride = s;
+
+		if ((iter->iter_flags & (ITER_IGNORE_RGB | ITER_IGNORE_ALPHA)) ==
+		    (ITER_IGNORE_RGB | ITER_IGNORE_ALPHA)) /* both flags set: existing destination pixels are not needed */
+		{
+		    iter->get_scanline = fast_dest_fetch_noop; /* skip the conversion, only step iter->bits */
+		}
+		else
+		{
+		    iter->get_scanline = f->get_scanline;
+		}
+		iter->write_back = f->write_back; /* write_back is installed in both cases */
+		return TRUE;
+	    }
+	}
+    }
+    return FALSE; /* flags did not match or the format is not in fetchers[] */
+}
+
+
 pixman_implementation_t *
 _pixman_implementation_create_fast_path (pixman_implementation_t *fallback)
 {
     pixman_implementation_t *imp = _pixman_implementation_create (fallback, c_fast_paths);
 
     imp->fill = fast_path_fill;
+    imp->src_iter_init = fast_src_iter_init; /* register the source iterator setup defined in this file */
+    imp->dest_iter_init = fast_dest_iter_init; /* register the destination iterator setup defined in this file */
 
     return imp;
 }
diff --git a/pixman/pixman/pixman-general.c b/pixman/pixman/pixman-general.c
index f175d771e..93a1b9acf 100644
--- a/pixman/pixman/pixman-general.c
+++ b/pixman/pixman/pixman-general.c
@@ -188,9 +188,6 @@ general_composite_rect  (pixman_implementation_t *imp,
     compose = _pixman_implementation_lookup_combiner (
 	imp->toplevel, op, component_alpha, narrow);
 
-    if (!compose)
-	return;
-
     for (i = 0; i < height; ++i)
     {
 	uint32_t *s, *m, *d;
diff --git a/pixman/pixman/pixman-glyph.c b/pixman/pixman/pixman-glyph.c
index 6d2c8bbb7..5a271b64b 100644
--- a/pixman/pixman/pixman-glyph.c
+++ b/pixman/pixman/pixman-glyph.c
@@ -463,16 +463,13 @@ pixman_composite_glyphs_no_mask (pixman_op_t            op,
 		{
 		    glyph_format = glyph_img->common.extended_format_code;
 		    glyph_flags = glyph_img->common.flags;
-		    
+
 		    _pixman_implementation_lookup_composite (
 			get_implementation(), op,
 			src->common.extended_format_code, src->common.flags,
 			glyph_format, glyph_flags | extra,
 			dest_format, dest_flags,
 			&implementation, &func);
-
-		    if (!func)
-			goto out;
 		}
 
 		info.src_x = src_x + composite_box.x1 - dest_x;
@@ -582,9 +579,6 @@ add_glyphs (pixman_glyph_cache_t *cache,
 		mask_format, info.mask_flags,
 		dest_format, dest_flags,
 		&implementation, &func);
-
-	    if (!func)
-		goto out;
 	}
 
 	glyph_box.x1 = glyphs[i].x - glyph->origin_x + off_x;
diff --git a/pixman/pixman/pixman-implementation.c b/pixman/pixman/pixman-implementation.c
index ec467a619..c0a643633 100644
--- a/pixman/pixman/pixman-implementation.c
+++ b/pixman/pixman/pixman-implementation.c
@@ -65,7 +65,13 @@ typedef struct
 
 PIXMAN_DEFINE_THREAD_LOCAL (cache_t, fast_path_cache);
 
-pixman_bool_t
+static void
+dummy_composite_rect (pixman_implementation_t *imp,
+		      pixman_composite_info_t *info)
+{   /* Deliberately empty: a composite function that does nothing and reads neither argument. */
+}
+
+void
 _pixman_implementation_lookup_composite (pixman_implementation_t  *toplevel,
 					 pixman_op_t               op,
 					 pixman_format_code_t      src_format,
@@ -142,7 +148,11 @@ _pixman_implementation_lookup_composite (pixman_implementation_t  *toplevel,
 	    ++info;
 	}
     }
-    return FALSE;
+
+    /* We should never reach this point */
+    _pixman_log_error (FUNC, "No known composite function\n");
+    *out_imp = NULL;
+    *out_func = dummy_composite_rect;
 
 update_cache:
     if (i)
@@ -160,8 +170,16 @@ update_cache:
 	cache->cache[0].fast_path.dest_flags = dest_flags;
 	cache->cache[0].fast_path.func = *out_func;
     }
+}
 
-    return TRUE;
+static void
+dummy_combine (pixman_implementation_t *imp,
+	       pixman_op_t              op,
+	       uint32_t *               pd,
+	       const uint32_t *         ps,
+	       const uint32_t *         pm,
+	       int                      w)
+{   /* Deliberately empty: a combiner that does nothing; 'pd' is left unmodified and no argument is read. */
 }
 
 pixman_combine_32_func_t
@@ -199,7 +217,9 @@ _pixman_implementation_lookup_combiner (pixman_implementation_t *imp,
 	imp = imp->fallback;
     }
 
-    return NULL;
+    /* We should never reach this point */
+    _pixman_log_error (FUNC, "No known combine function\n");
+    return dummy_combine;
 }
 
 pixman_bool_t
diff --git a/pixman/pixman/pixman-inlines.h b/pixman/pixman/pixman-inlines.h
index ab4def0dc..dd1c2f17f 100644
--- a/pixman/pixman/pixman-inlines.h
+++ b/pixman/pixman/pixman-inlines.h
@@ -88,6 +88,42 @@ pixman_fixed_to_bilinear_weight (pixman_fixed_t x)
 	   ((1 << BILINEAR_INTERPOLATION_BITS) - 1);
 }
 
+#if BILINEAR_INTERPOLATION_BITS <= 4
+/* Inspired by Filter_32_opaque from Skia: blends four 32-bit pixels (tl, tr, bl, br), two 8-bit channels per multiply, with weights rescaled to 4 bits so that the four weights always sum to 256. */
+static force_inline uint32_t
+bilinear_interpolation (uint32_t tl, uint32_t tr,
+			uint32_t bl, uint32_t br,
+			int distx, int disty)
+{
+    int distxy, distxiy, distixy, distixiy;
+    uint32_t lo, hi;
+
+    distx <<= (4 - BILINEAR_INTERPOLATION_BITS); /* rescale to 4-bit weights; NOTE(review): assumes distx/disty < (1 << BILINEAR_INTERPOLATION_BITS) - confirm against callers */
+    disty <<= (4 - BILINEAR_INTERPOLATION_BITS);
+
+    distxy = distx * disty; /* weight of br */
+    distxiy = (distx << 4) - distxy;	/* distx * (16 - disty) */
+    distixy = (disty << 4) - distxy;	/* disty * (16 - distx) */
+    distixiy =
+	16 * 16 - (disty << 4) -
+	(distx << 4) + distxy; /* (16 - distx) * (16 - disty) */
+
+    lo = (tl & 0xff00ff) * distixiy; /* lo accumulates bytes 0 and 2 of each pixel, one per 16-bit lane */
+    hi = ((tl >> 8) & 0xff00ff) * distixiy; /* hi accumulates bytes 1 and 3 */
+
+    lo += (tr & 0xff00ff) * distxiy;
+    hi += ((tr >> 8) & 0xff00ff) * distxiy;
+
+    lo += (bl & 0xff00ff) * distixy;
+    hi += ((bl >> 8) & 0xff00ff) * distixy;
+
+    lo += (br & 0xff00ff) * distxy; /* each lane is at most 255 * 256, so it cannot carry into the neighbouring lane */
+    hi += ((br >> 8) & 0xff00ff) * distxy;
+
+    return ((lo >> 8) & 0xff00ff) | (hi & ~0xff00ff); /* divide by 256: lo is shifted down, hi already has its results in bytes 1 and 3 */
+}
+
+#else
 #if SIZEOF_LONG > 4
 
 static force_inline uint32_t
@@ -184,6 +220,7 @@ bilinear_interpolation (uint32_t tl, uint32_t tr,
 }
 
 #endif
+#endif /* BILINEAR_INTERPOLATION_BITS <= 4 */
 
 /*
  * For each scanline fetched from source image with PAD repeat:
diff --git a/pixman/pixman/pixman-matrix.c b/pixman/pixman/pixman-matrix.c
index cd2f1b5b8..89b96826b 100644
--- a/pixman/pixman/pixman-matrix.c
+++ b/pixman/pixman/pixman-matrix.c
@@ -34,6 +34,338 @@
 
 #define F(x)    pixman_int_to_fixed (x)
 
+static force_inline int
+count_leading_zeros (uint32_t x)
+{   /* Returns the number of leading zero bits in x (0..32); x == 0 yields 32 on both code paths. */
+#ifdef __GNUC__
+    return x ? __builtin_clz (x) : 32; /* __builtin_clz (0) is undefined, so guard it to match the fallback */
+#else
+    int n = 0;
+    while (x) /* n ends up as the 1-based position of the highest set bit */
+    {
+        n++;
+        x >>= 1;
+    }
+    return 32 - n;
+#endif
+}
+
+/*
+ * Large signed/unsigned integer division with rounding for the platforms with
+ * only 64-bit integer data type supported (no 128-bit data type).
+ *
+ * Arguments:
+ *     hi, lo - high and low 64-bit parts of the dividend
+ *     div    - 48-bit divisor (49-bit signed for the signed variant)
+ *
+ * Returns: lowest 64 bits of the result as a return value and highest 64
+ *          bits of the result to "result_hi" pointer
+ */
+
+/* grade-school unsigned division (128-bit by 48-bit) with rounding to nearest: the low word is divided 16 bits at a time, carrying the remainder into the next step */
+static force_inline uint64_t
+rounded_udiv_128_by_48 (uint64_t  hi,
+                        uint64_t  lo,
+                        uint64_t  div,
+                        uint64_t *result_hi)
+{
+    uint64_t tmp, remainder, result_lo;
+    assert(div < ((uint64_t)1 << 48)); /* keeps every remainder below 2^48, so (remainder << 16) plus 16 new bits fits in 64 bits */
+
+    remainder = hi % div;
+    *result_hi = hi / div; /* result_hi is written unconditionally and must not be NULL */
+
+    tmp = (remainder << 16) + (lo >> 48); /* bring down bits 48-63 of lo */
+    result_lo = tmp / div;
+    remainder = tmp % div;
+
+    tmp = (remainder << 16) + ((lo >> 32) & 0xFFFF); /* bits 32-47 */
+    result_lo = (result_lo << 16) + (tmp / div);
+    remainder = tmp % div;
+
+    tmp = (remainder << 16) + ((lo >> 16) & 0xFFFF); /* bits 16-31 */
+    result_lo = (result_lo << 16) + (tmp / div);
+    remainder = tmp % div;
+
+    tmp = (remainder << 16) + (lo & 0xFFFF); /* bits 0-15 */
+    result_lo = (result_lo << 16) + (tmp / div);
+    remainder = tmp % div;
+
+    /* round to nearest: halves round up, and a wrap of the low word carries into the high word */
+    if (remainder * 2 >= div && ++result_lo == 0)
+        *result_hi += 1;
+
+    return result_lo;
+}
+
+/* signed division (128-bit by 49-bit) with rounding to nearest: divides the magnitudes with rounded_udiv_128_by_48 and negates the quotient when exactly one operand was negative */
+static inline int64_t
+rounded_sdiv_128_by_49 (int64_t   hi,
+                        uint64_t  lo,
+                        int64_t   div,
+                        int64_t  *signed_result_hi)
+{
+    uint64_t result_lo, result_hi;
+    int sign = 0; /* becomes 1 when the quotient has to be negated */
+    if (div < 0)
+    {
+        div = -div;
+        sign ^= 1;
+    }
+    if (hi < 0) /* negate the 128-bit dividend (hi, lo) */
+    {
+        if (lo != 0)
+            hi++; /* with the negation below this gives ~hi, i.e. the borrow from a non-zero low word */
+        hi = -hi;
+        lo = -lo; /* unsigned negation of the low word */
+        sign ^= 1;
+    }
+    result_lo = rounded_udiv_128_by_48 (hi, lo, div, &result_hi);
+    if (sign) /* negate the 128-bit quotient the same way */
+    {
+        if (result_lo != 0)
+            result_hi++;
+        result_hi = -result_hi;
+        result_lo = -result_lo;
+    }
+    if (signed_result_hi) /* the high word is optional: NULL means the caller does not want it */
+    {
+        *signed_result_hi = result_hi;
+    }
+    return result_lo; /* low 64 bits of the quotient, converted to int64_t */
+}
+
+/*
+ * Multiply 64.16 fixed point value by (2^scalebits) and convert
+ * to 128-bit integer. Input is hi + lo / 65536; output is written to *rhi (high word) and *rlo (low word).
+ */
+static force_inline void
+fixed_64_16_to_int128 (int64_t  hi,
+                       int64_t  lo,
+                       int64_t *rhi,
+                       int64_t *rlo,
+                       int      scalebits)
+{
+    /* separate integer and fractional parts */
+    hi += lo >> 16; /* fold whole units carried in lo into hi */
+    lo &= 0xFFFF; /* lo is now a pure 16-bit fraction */
+
+    if (scalebits <= 0) /* scaling down: the fraction is dropped entirely */
+    {
+        *rlo = hi >> (-scalebits);
+        *rhi = *rlo >> 63; /* sign-extend into the high word (assumes >> on a negative int64_t is arithmetic) */
+    }
+    else
+    {
+        *rhi = hi >> (64 - scalebits); /* bits shifted out of the top of hi */
+        *rlo = (uint64_t)hi << scalebits; /* shift as unsigned to avoid left-shifting a negative value */
+        if (scalebits < 16)
+            *rlo += lo >> (16 - scalebits); /* only the top scalebits bits of the fraction survive */
+        else
+            *rlo += lo << (scalebits - 16);
+    }
+}
+
+/*
+ * Convert 112.16 fixed point value to 48.16 with clamping for the out
+ * of range values. *clampflag is set to TRUE when clamping happens and is left untouched otherwise.
+ */
+static force_inline pixman_fixed_48_16_t
+fixed_112_16_to_fixed_48_16 (int64_t hi, int64_t lo, pixman_bool_t *clampflag)
+{
+    if ((lo >> 63) != hi) /* the value fits in 64 bits only if hi is the sign extension of lo (assumes arithmetic >>) */
+    {
+        *clampflag = TRUE;
+        return hi >= 0 ? INT64_MAX : INT64_MIN; /* saturate in the direction of the sign */
+    }
+    else
+    {
+        return lo;
+    }
+}
+
+/*
+ * Transform a point with 31.16 fixed point coordinates from the destination
+ * space to a point with 48.16 fixed point coordinates in the source space.
+ * No overflows are possible for affine transformations and the results are
+ * accurate including the least significant bit. Projective transformations
+ * may overflow, in this case the results are just clamped to return maximum
+ * or minimum 48.16 values (so that the caller can at least handle the NONE
+ * and PAD repeats correctly) and the return value is FALSE to indicate that
+ * such clamping has happened.
+ */
+PIXMAN_EXPORT pixman_bool_t
+pixman_transform_point_31_16 (const pixman_transform_t    *t,
+                              const pixman_vector_48_16_t *v,
+                              pixman_vector_48_16_t       *result)
+{
+    pixman_bool_t clampflag = FALSE;
+    int i;
+    int64_t tmp[3][2], divint; /* tmp[i][0]: products with the integer parts of v; tmp[i][1]: products with the 16-bit fractions */
+    uint16_t divfrac;
+
+    /* input vector values must have no more than 31 bits (including sign)
+     * in the integer part */
+    assert (v->v[0] <   ((pixman_fixed_48_16_t)1 << (30 + 16)));
+    assert (v->v[0] >= -((pixman_fixed_48_16_t)1 << (30 + 16)));
+    assert (v->v[1] <   ((pixman_fixed_48_16_t)1 << (30 + 16)));
+    assert (v->v[1] >= -((pixman_fixed_48_16_t)1 << (30 + 16)));
+    assert (v->v[2] <   ((pixman_fixed_48_16_t)1 << (30 + 16)));
+    assert (v->v[2] >= -((pixman_fixed_48_16_t)1 << (30 + 16)));
+
+    for (i = 0; i < 3; i++) /* row i of the matrix times v; v is fully read before 'result' is written */
+    {
+        tmp[i][0] = (int64_t)t->matrix[i][0] * (v->v[0] >> 16);
+        tmp[i][1] = (int64_t)t->matrix[i][0] * (v->v[0] & 0xFFFF);
+        tmp[i][0] += (int64_t)t->matrix[i][1] * (v->v[1] >> 16);
+        tmp[i][1] += (int64_t)t->matrix[i][1] * (v->v[1] & 0xFFFF);
+        tmp[i][0] += (int64_t)t->matrix[i][2] * (v->v[2] >> 16);
+        tmp[i][1] += (int64_t)t->matrix[i][2] * (v->v[2] & 0xFFFF);
+    }
+
+    /*
+     * separate 64-bit integer and 16-bit fractional parts for the divisor,
+     * which is also scaled by 65536 after fixed point multiplication.
+     */
+    divint  = tmp[2][0] + (tmp[2][1] >> 16);
+    divfrac = tmp[2][1] & 0xFFFF;
+
+    if (divint == pixman_fixed_1 && divfrac == 0)
+    {
+        /*
+         * this is a simple affine transformation
+         */
+        result->v[0] = tmp[0][0] + ((tmp[0][1] + 0x8000) >> 16); /* + 0x8000 rounds the fractional product to nearest */
+        result->v[1] = tmp[1][0] + ((tmp[1][1] + 0x8000) >> 16);
+        result->v[2] = pixman_fixed_1;
+    }
+    else if (divint == 0 && divfrac == 0)
+    {
+        /*
+         * handle zero divisor (if the values are non-zero, set the
+         * results to maximum positive or minimum negative)
+         */
+        clampflag = TRUE; /* a zero divisor always makes the function return FALSE */
+
+        result->v[0] = tmp[0][0] + ((tmp[0][1] + 0x8000) >> 16);
+        result->v[1] = tmp[1][0] + ((tmp[1][1] + 0x8000) >> 16);
+
+        if (result->v[0] > 0)
+            result->v[0] = INT64_MAX;
+        else if (result->v[0] < 0)
+            result->v[0] = INT64_MIN;
+
+        if (result->v[1] > 0)
+            result->v[1] = INT64_MAX;
+        else if (result->v[1] < 0)
+            result->v[1] = INT64_MIN; /* a coordinate that is exactly 0 stays 0 */
+    }
+    else
+    {
+        /*
+         * projective transformation, analyze the top 32 bits of the divisor
+         */
+        int32_t hi32divbits = divint >> 32;
+        if (hi32divbits < 0)
+            hi32divbits = ~hi32divbits; /* fold negatives so that 0 means the top half is pure sign extension */
+
+        if (hi32divbits == 0)
+        {
+            /* the divisor is small, we can actually keep all the bits */
+            int64_t hi, rhi, lo, rlo;
+            int64_t div = (divint << 16) + divfrac;
+
+            fixed_64_16_to_int128 (tmp[0][0], tmp[0][1], &hi, &lo, 32); /* numerator scaled by 2^32, divisor by 2^16: the quotient keeps 16 fractional bits */
+            rlo = rounded_sdiv_128_by_49 (hi, lo, div, &rhi);
+            result->v[0] = fixed_112_16_to_fixed_48_16 (rhi, rlo, &clampflag);
+
+            fixed_64_16_to_int128 (tmp[1][0], tmp[1][1], &hi, &lo, 32);
+            rlo = rounded_sdiv_128_by_49 (hi, lo, div, &rhi);
+            result->v[1] = fixed_112_16_to_fixed_48_16 (rhi, rlo, &clampflag);
+        }
+        else
+        {
+            /* the divisor needs to be reduced to 48 bits */
+            int64_t hi, rhi, lo, rlo, div;
+            int shift = 32 - count_leading_zeros (hi32divbits); /* significant bits in the top half; hi32divbits is non-zero here */
+            fixed_64_16_to_int128 (divint, divfrac, &hi, &div, 16 - shift); /* 'hi' is only a scratch output here; it is overwritten below */
+
+            fixed_64_16_to_int128 (tmp[0][0], tmp[0][1], &hi, &lo, 32 - shift); /* numerators lose the same 'shift' bits as the divisor, so the quotient scale is unchanged */
+            rlo = rounded_sdiv_128_by_49 (hi, lo, div, &rhi);
+            result->v[0] = fixed_112_16_to_fixed_48_16 (rhi, rlo, &clampflag);
+
+            fixed_64_16_to_int128 (tmp[1][0], tmp[1][1], &hi, &lo, 32 - shift);
+            rlo = rounded_sdiv_128_by_49 (hi, lo, div, &rhi);
+            result->v[1] = fixed_112_16_to_fixed_48_16 (rhi, rlo, &clampflag);
+        }
+    }
+    result->v[2] = pixman_fixed_1; /* the third component is always set to 1.0, whichever branch ran */
+    return !clampflag; /* FALSE if the divisor was zero or any coordinate was clamped */
+}
+
+PIXMAN_EXPORT void
+pixman_transform_point_31_16_affine (const pixman_transform_t    *t,
+                                     const pixman_vector_48_16_t *v,
+                                     pixman_vector_48_16_t       *result)
+{   /* Applies only the first two rows of 't' to (v[0], v[1]); v[2] and the third matrix row are not read, and result->v[2] is set to pixman_fixed_1. */
+    int64_t hi0, lo0, hi1, lo1; /* hiN: products with integer parts; loN: products with the 16-bit fractions */
+
+    /* input vector values must have no more than 31 bits (including sign)
+     * in the integer part */
+    assert (v->v[0] <   ((pixman_fixed_48_16_t)1 << (30 + 16)));
+    assert (v->v[0] >= -((pixman_fixed_48_16_t)1 << (30 + 16)));
+    assert (v->v[1] <   ((pixman_fixed_48_16_t)1 << (30 + 16)));
+    assert (v->v[1] >= -((pixman_fixed_48_16_t)1 << (30 + 16)));
+
+    hi0  = (int64_t)t->matrix[0][0] * (v->v[0] >> 16);
+    lo0  = (int64_t)t->matrix[0][0] * (v->v[0] & 0xFFFF);
+    hi0 += (int64_t)t->matrix[0][1] * (v->v[1] >> 16);
+    lo0 += (int64_t)t->matrix[0][1] * (v->v[1] & 0xFFFF);
+    hi0 += (int64_t)t->matrix[0][2]; /* third column added as-is, i.e. multiplied by an implicit 1 */
+
+    hi1  = (int64_t)t->matrix[1][0] * (v->v[0] >> 16);
+    lo1  = (int64_t)t->matrix[1][0] * (v->v[0] & 0xFFFF);
+    hi1 += (int64_t)t->matrix[1][1] * (v->v[1] >> 16);
+    lo1 += (int64_t)t->matrix[1][1] * (v->v[1] & 0xFFFF);
+    hi1 += (int64_t)t->matrix[1][2];
+
+    result->v[0] = hi0 + ((lo0 + 0x8000) >> 16); /* + 0x8000 rounds the fractional product to nearest */
+    result->v[1] = hi1 + ((lo1 + 0x8000) >> 16);
+    result->v[2] = pixman_fixed_1;
+}
+
+PIXMAN_EXPORT void
+pixman_transform_point_31_16_3d (const pixman_transform_t    *t,
+                                 const pixman_vector_48_16_t *v,
+                                 pixman_vector_48_16_t       *result)
+{   /* Multiplies the full 3x3 matrix 't' by 'v' and stores all three components in 'result'; no division by the third component is done. */
+    int i;
+    int64_t tmp[3][2]; /* tmp[i][0]: products with the integer parts of v; tmp[i][1]: products with the 16-bit fractions */
+
+    /* input vector values must have no more than 31 bits (including sign)
+     * in the integer part */
+    assert (v->v[0] <   ((pixman_fixed_48_16_t)1 << (30 + 16)));
+    assert (v->v[0] >= -((pixman_fixed_48_16_t)1 << (30 + 16)));
+    assert (v->v[1] <   ((pixman_fixed_48_16_t)1 << (30 + 16)));
+    assert (v->v[1] >= -((pixman_fixed_48_16_t)1 << (30 + 16)));
+    assert (v->v[2] <   ((pixman_fixed_48_16_t)1 << (30 + 16)));
+    assert (v->v[2] >= -((pixman_fixed_48_16_t)1 << (30 + 16)));
+
+    for (i = 0; i < 3; i++) /* v is fully read here before 'result' is written below */
+    {
+        tmp[i][0] = (int64_t)t->matrix[i][0] * (v->v[0] >> 16);
+        tmp[i][1] = (int64_t)t->matrix[i][0] * (v->v[0] & 0xFFFF);
+        tmp[i][0] += (int64_t)t->matrix[i][1] * (v->v[1] >> 16);
+        tmp[i][1] += (int64_t)t->matrix[i][1] * (v->v[1] & 0xFFFF);
+        tmp[i][0] += (int64_t)t->matrix[i][2] * (v->v[2] >> 16);
+        tmp[i][1] += (int64_t)t->matrix[i][2] * (v->v[2] & 0xFFFF);
+    }
+
+    result->v[0] = tmp[0][0] + ((tmp[0][1] + 0x8000) >> 16); /* + 0x8000 rounds the fractional product to nearest */
+    result->v[1] = tmp[1][0] + ((tmp[1][1] + 0x8000) >> 16);
+    result->v[2] = tmp[2][0] + ((tmp[2][1] + 0x8000) >> 16);
+}
+
 PIXMAN_EXPORT void
 pixman_transform_init_identity (struct pixman_transform *matrix)
 {
@@ -50,69 +382,41 @@ PIXMAN_EXPORT pixman_bool_t
 pixman_transform_point_3d (const struct pixman_transform *transform,
                            struct pixman_vector *         vector)
 {
-    struct pixman_vector result;
-    pixman_fixed_32_32_t partial;
-    pixman_fixed_48_16_t v;
-    int i, j;
+    pixman_vector_48_16_t tmp; /* working copy of the caller's vector */
+    tmp.v[0] = vector->vector[0];
+    tmp.v[1] = vector->vector[1];
+    tmp.v[2] = vector->vector[2];
 
-    for (j = 0; j < 3; j++)
-    {
-	v = 0;
-	for (i = 0; i < 3; i++)
-	{
-	    partial = ((pixman_fixed_48_16_t) transform->matrix[j][i] *
-	               (pixman_fixed_48_16_t) vector->vector[i]);
-	    v += (partial + 0x8000) >> 16;
-	}
-	
-	if (v > pixman_max_fixed_48_16 || v < pixman_min_fixed_48_16)
-	    return FALSE;
-	
-	result.vector[j] = (pixman_fixed_t) v;
-    }
-    
-    *vector = result;
+    pixman_transform_point_31_16_3d (transform, &tmp, &tmp); /* input and output are the same object */
 
-    if (!result.vector[2])
-	return FALSE;
+    vector->vector[0] = tmp.v[0]; /* stored back unconditionally, even if the check below fails */
+    vector->vector[1] = tmp.v[1];
+    vector->vector[2] = tmp.v[2];
 
-    return TRUE;
+    return vector->vector[0] == tmp.v[0] && /* TRUE only if every stored component still equals tmp */
+           vector->vector[1] == tmp.v[1] &&
+           vector->vector[2] == tmp.v[2];
 }
 
 PIXMAN_EXPORT pixman_bool_t
 pixman_transform_point (const struct pixman_transform *transform,
                         struct pixman_vector *         vector)
 {
-    pixman_fixed_32_32_t partial;
-    pixman_fixed_34_30_t v[3];
-    pixman_fixed_48_16_t quo;
-    int i, j;
+    pixman_vector_48_16_t tmp; /* working copy of the caller's vector */
+    tmp.v[0] = vector->vector[0];
+    tmp.v[1] = vector->vector[1];
+    tmp.v[2] = vector->vector[2];
 
-    for (j = 0; j < 3; j++)
-    {
-	v[j] = 0;
-	
-	for (i = 0; i < 3; i++)
-	{
-	    partial = ((pixman_fixed_32_32_t) transform->matrix[j][i] *
-	               (pixman_fixed_32_32_t) vector->vector[i]);
-	    v[j] += (partial + 2) >> 2;
-	}
-    }
-    
-    if (!((v[2] + 0x8000) >> 16))
-	return FALSE;
+    if (!pixman_transform_point_31_16 (transform, &tmp, &tmp)) /* NOTE(review): helper defined outside this chunk */
+        return FALSE; /* on this path *vector is left untouched */
 
-    for (j = 0; j < 2; j++)
-    {
-	quo = v[j] / ((v[2] + 0x8000) >> 16);
-	if (quo > pixman_max_fixed_48_16 || quo < pixman_min_fixed_48_16)
-	    return FALSE;
-	vector->vector[j] = (pixman_fixed_t) quo;
-    }
-    
-    vector->vector[2] = pixman_fixed_1;
-    return TRUE;
+    vector->vector[0] = tmp.v[0]; /* stored back before the equality check below */
+    vector->vector[1] = tmp.v[1];
+    vector->vector[2] = tmp.v[2];
+
+    return vector->vector[0] == tmp.v[0] && /* TRUE only if every stored component still equals tmp */
+           vector->vector[1] == tmp.v[1] &&
+           vector->vector[2] == tmp.v[2];
 }
 
 PIXMAN_EXPORT pixman_bool_t
diff --git a/pixman/pixman/pixman-private.h b/pixman/pixman/pixman-private.h
index e5ab873ed..cb78a2ed8 100644
--- a/pixman/pixman/pixman-private.h
+++ b/pixman/pixman/pixman-private.h
@@ -497,7 +497,7 @@ pixman_implementation_t *
 _pixman_implementation_create (pixman_implementation_t *fallback,
 			       const pixman_fast_path_t *fast_paths);
 
-pixman_bool_t
+void
 _pixman_implementation_lookup_composite (pixman_implementation_t  *toplevel,
 					 pixman_op_t               op,
 					 pixman_format_code_t      src_format,
@@ -1052,7 +1052,7 @@ _pixman_log_error (const char *function, const char *message);
 
 #else
 
-#define _pixman_log_error(f,m) do { } while (0)				\
+#define _pixman_log_error(f,m) do { } while (0)
 
 #define return_if_fail(expr)						\
     do                                                                  \
@@ -1078,6 +1078,27 @@ _pixman_log_error (const char *function, const char *message);
 #endif
 
 /*
+ * Matrix
+ */
+
+typedef struct { pixman_fixed_48_16_t v[3]; } pixman_vector_48_16_t; /* three pixman_fixed_48_16_t components */
+/* General entry point; its boolean result is what matrix-test.c records as transform_ok. */
+pixman_bool_t
+pixman_transform_point_31_16 (const pixman_transform_t    *t,
+                              const pixman_vector_48_16_t *v,
+                              pixman_vector_48_16_t       *result);
+/* Multiplies all three rows without dividing; asserts each input lies in [-2^46, 2^46). */
+void
+pixman_transform_point_31_16_3d (const pixman_transform_t    *t,
+                                 const pixman_vector_48_16_t *v,
+                                 pixman_vector_48_16_t       *result);
+/* NOTE(review): matrix-test.c only calls this for a 0,0,1 bottom row and v[2] == pixman_fixed_1. */
+void
+pixman_transform_point_31_16_affine (const pixman_transform_t    *t,
+                                     const pixman_vector_48_16_t *v,
+                                     pixman_vector_48_16_t       *result);
+
+/*
  * Timers
  */
 
diff --git a/pixman/pixman/pixman-sse2.c b/pixman/pixman/pixman-sse2.c
index 5a0e0626a..fc873cc96 100644
--- a/pixman/pixman/pixman-sse2.c
+++ b/pixman/pixman/pixman-sse2.c
@@ -4523,7 +4523,163 @@ sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
 
 	sse2_combine_add_u (imp, op, dst, src, NULL, width);
     }
+}
+
+static void
+sse2_composite_add_n_8888 (pixman_implementation_t *imp,
+			   pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t *dst_line, *dst, src;
+    int dst_stride;
+    /* ADD with a solid source: every destination byte gets a saturating add */
+    __m128i xmm_src;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+    if (src == 0)
+	return;
+    /* an all-ones source saturates every byte, so a plain fill is enough */
+    if (src == ~0)
+    {
+	pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride, 32,
+		     dest_x, dest_y, width, height, ~0);
+
+	return;
+    }
+
+    xmm_src = _mm_set_epi32 (src, src, src, src);
+    while (height--)
+    {
+	int w = width;
+	uint32_t d;
+
+	dst = dst_line;
+	dst_line += dst_stride;
+	/* head: one pixel at a time until dst reaches a 16-byte boundary */
+	while (w && ((uintptr_t)dst & 15))
+	{
+	    d = *dst;
+	    *dst++ =
+		_mm_cvtsi128_si32 ( _mm_adds_epu8 (xmm_src, _mm_cvtsi32_si128 (d)));
+	    w--;
+	}
+	/* body: four pixels per aligned 128-bit load, add and store */
+	while (w >= 4)
+	{
+	    save_128_aligned
+		((__m128i*)dst,
+		 _mm_adds_epu8 (xmm_src, load_128_aligned ((__m128i*)dst)));
+
+	    dst += 4;
+	    w -= 4;
+	}
+	/* tail: the remaining (at most three) pixels */
+	while (w--)
+	{
+	    d = *dst;
+	    *dst++ =
+		_mm_cvtsi128_si32 (_mm_adds_epu8 (xmm_src,
+						  _mm_cvtsi32_si128 (d)));
+	}
+    }
+}
+
+static void
+sse2_composite_add_n_8_8888 (pixman_implementation_t *imp,
+			     pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t     *dst_line, *dst;
+    uint8_t     *mask_line, *mask;
+    int dst_stride, mask_stride;
+    int32_t w;
+    uint32_t src;
+    /* ADD of the solid source, scaled by an 8-bit mask, onto 32bpp pixels */
+    __m128i xmm_src;
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+    if (src == 0)
+	return;
+    xmm_src = expand_pixel_32_1x128 (src);
+
+    PIXMAN_IMAGE_GET_LINE (
+	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	mask = mask_line;
+	mask_line += mask_stride;
+	w = width;
+	/* head: one pixel at a time until dst reaches a 16-byte boundary */
+	while (w && ((uintptr_t)dst & 15))
+	{
+	    uint8_t m = *mask++;
+	    if (m)
+	    {
+		*dst = pack_1x128_32
+		    (_mm_adds_epu16
+		     (pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)),
+		      unpack_32_1x128 (*dst)));
+	    }
+	    dst++;
+	    w--;
+	}
+	/* body: four mask bytes and four aligned pixels per pass; an all-zero mask skips the store */
+	while (w >= 4)
+	{
+	    uint32_t m = *(uint32_t*)mask; /* NOTE(review): 32-bit read through a uint8_t pointer; alignment of mask is not established here */
+	    if (m)
+	    {
+		__m128i xmm_mask_lo, xmm_mask_hi;
+		__m128i xmm_dst_lo, xmm_dst_hi;
+
+		__m128i xmm_dst = load_128_aligned ((__m128i*)dst);
+		__m128i xmm_mask =
+		    _mm_unpacklo_epi8 (unpack_32_1x128(m),
+				       _mm_setzero_si128 ());
+
+		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
+					&xmm_mask_lo, &xmm_mask_hi);
+
+		pix_multiply_2x128 (&xmm_src, &xmm_src,
+				    &xmm_mask_lo, &xmm_mask_hi,
+				    &xmm_mask_lo, &xmm_mask_hi);
+
+		xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
+		xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
+
+		save_128_aligned (
+		    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+	    }
 
+	    w -= 4;
+	    dst += 4;
+	    mask += 4;
+	}
+	/* tail: the remaining (at most three) pixels */
+	while (w)
+	{
+	    uint8_t m = *mask++;
+	    if (m)
+	    {
+		*dst = pack_1x128_32
+		    (_mm_adds_epu16
+		     (pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)),
+		      unpack_32_1x128 (*dst)));
+	    }
+	    dst++;
+	    w--;
+	}
+    }
 }
 
 static pixman_bool_t
@@ -5786,6 +5942,121 @@ FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_normal_OVER,
 			       uint32_t, uint8_t, uint32_t,
 			       NORMAL, FLAG_HAVE_NON_SOLID_MASK)
 
+static force_inline void
+scaled_bilinear_scanline_sse2_8888_n_8888_OVER (uint32_t *       dst,
+						const uint32_t * mask,
+						const uint32_t * src_top,
+						const uint32_t * src_bottom,
+						int32_t          w,
+						int              wt,
+						int              wb,
+						pixman_fixed_t   vx,
+						pixman_fixed_t   unit_x,
+						pixman_fixed_t   max_vx,
+						pixman_bool_t    zero_src)
+{
+    BILINEAR_DECLARE_VARIABLES;
+    uint32_t pix1, pix2, pix3, pix4;
+    __m128i xmm_mask;
+    /* One scanline: interpolated source pixels go through in_over_* with the solid mask and dst */
+    if (zero_src || (*mask >> 24) == 0) /* nothing to do: zero_src set or top mask byte is 0 */
+	return;
+
+    xmm_mask = create_mask_16_128 (*mask >> 24); /* only the top byte of *mask is used */
+    /* head: one pixel at a time until dst reaches a 16-byte boundary */
+    while (w && ((uintptr_t)dst & 15))
+    {
+	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+	if (pix1)
+	{
+		uint32_t d = *dst;
+
+		__m128i ms = unpack_32_1x128 (pix1);
+		__m128i alpha     = expand_alpha_1x128 (ms);
+		__m128i dest      = xmm_mask; /* NB: despite the name, this holds the mask */
+		__m128i alpha_dst = unpack_32_1x128 (d); /* NB: this is the unpacked destination pixel */
+
+		*dst = pack_1x128_32
+			(in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
+	}
+
+	dst++;
+	w--;
+    }
+    /* body: four interpolated pixels per aligned 128-bit destination access */
+    while (w >= 4)
+    {
+	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+	BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
+	BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
+	BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
+
+	if (pix1 | pix2 | pix3 | pix4) /* all four zero: destination is left untouched */
+	{
+	    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
+	    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+	    __m128i xmm_alpha_lo, xmm_alpha_hi;
+
+	    xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1);
+
+	    xmm_dst = load_128_aligned ((__m128i*)dst);
+
+	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+	    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+				&xmm_alpha_lo, &xmm_alpha_hi);
+
+	    in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
+			   &xmm_alpha_lo, &xmm_alpha_hi,
+			   &xmm_mask, &xmm_mask,
+			   &xmm_dst_lo, &xmm_dst_hi);
+
+	    save_128_aligned
+		((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+	}
+
+	dst += 4;
+	w -= 4;
+    }
+    /* tail: the remaining (at most three) pixels, same steps as the head loop */
+    while (w)
+    {
+	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+	if (pix1)
+	{
+		uint32_t d = *dst;
+
+		__m128i ms = unpack_32_1x128 (pix1);
+		__m128i alpha     = expand_alpha_1x128 (ms);
+		__m128i dest      = xmm_mask; /* NB: despite the name, this holds the mask */
+		__m128i alpha_dst = unpack_32_1x128 (d); /* NB: this is the unpacked destination pixel */
+
+		*dst = pack_1x128_32
+			(in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
+	}
+
+	dst++;
+	w--;
+    }
+}
+
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER,
+			       scaled_bilinear_scanline_sse2_8888_n_8888_OVER,
+			       uint32_t, uint32_t, uint32_t,
+			       COVER, FLAG_HAVE_SOLID_MASK)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER,
+			       scaled_bilinear_scanline_sse2_8888_n_8888_OVER,
+			       uint32_t, uint32_t, uint32_t,
+			       PAD, FLAG_HAVE_SOLID_MASK)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
+			       scaled_bilinear_scanline_sse2_8888_n_8888_OVER,
+			       uint32_t, uint32_t, uint32_t,
+			       NONE, FLAG_HAVE_SOLID_MASK)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_normal_OVER,
+			       scaled_bilinear_scanline_sse2_8888_n_8888_OVER,
+			       uint32_t, uint32_t, uint32_t,
+			       NORMAL, FLAG_HAVE_SOLID_MASK)
+
 static const pixman_fast_path_t sse2_fast_paths[] =
 {
     /* PIXMAN_OP_OVER */
@@ -5848,6 +6119,14 @@ static const pixman_fast_path_t sse2_fast_paths[] =
     PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, sse2_composite_add_8888_8888),
     PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, sse2_composite_add_n_8_8),
     PIXMAN_STD_FAST_PATH (ADD, solid, null, a8, sse2_composite_add_n_8),
+    PIXMAN_STD_FAST_PATH (ADD, solid, null, x8r8g8b8, sse2_composite_add_n_8888),
+    PIXMAN_STD_FAST_PATH (ADD, solid, null, a8r8g8b8, sse2_composite_add_n_8888),
+    PIXMAN_STD_FAST_PATH (ADD, solid, null, x8b8g8r8, sse2_composite_add_n_8888),
+    PIXMAN_STD_FAST_PATH (ADD, solid, null, a8b8g8r8, sse2_composite_add_n_8888),
+    PIXMAN_STD_FAST_PATH (ADD, solid, a8, x8r8g8b8, sse2_composite_add_n_8_8888),
+    PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8r8g8b8, sse2_composite_add_n_8_8888),
+    PIXMAN_STD_FAST_PATH (ADD, solid, a8, x8b8g8r8, sse2_composite_add_n_8_8888),
+    PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8b8g8r8, sse2_composite_add_n_8_8888),
 
     /* PIXMAN_OP_SRC */
     PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, sse2_composite_src_n_8_8888),
@@ -5912,6 +6191,11 @@ static const pixman_fast_path_t sse2_fast_paths[] =
     SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
     SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
 
+    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
+    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
+    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
+    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
+
     SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8_8888),
     SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8_8888),
     SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8_8888),
diff --git a/pixman/pixman/pixman.c b/pixman/pixman/pixman.c
index 3fabed161..184f0c4e6 100644
--- a/pixman/pixman/pixman.c
+++ b/pixman/pixman/pixman.c
@@ -581,11 +581,13 @@ pixman_image_composite32 (pixman_op_t      op,
                           int32_t          height)
 {
     pixman_format_code_t src_format, mask_format, dest_format;
-    uint32_t src_flags, mask_flags, dest_flags;
     pixman_region32_t region;
     pixman_box32_t extents;
     pixman_implementation_t *imp;
     pixman_composite_func_t func;
+    pixman_composite_info_t info;
+    const pixman_box32_t *pbox;
+    int n;
 
     _pixman_image_validate (src);
     if (mask)
@@ -593,27 +595,27 @@ pixman_image_composite32 (pixman_op_t      op,
     _pixman_image_validate (dest);
 
     src_format = src->common.extended_format_code;
-    src_flags = src->common.flags;
+    info.src_flags = src->common.flags;
 
     if (mask && !(mask->common.flags & FAST_PATH_IS_OPAQUE))
     {
 	mask_format = mask->common.extended_format_code;
-	mask_flags = mask->common.flags;
+	info.mask_flags = mask->common.flags;
     }
     else
     {
 	mask_format = PIXMAN_null;
-	mask_flags = FAST_PATH_IS_OPAQUE;
+	info.mask_flags = FAST_PATH_IS_OPAQUE;
     }
 
     dest_format = dest->common.extended_format_code;
-    dest_flags = dest->common.flags;
+    info.dest_flags = dest->common.flags;
 
     /* Check for pixbufs */
     if ((mask_format == PIXMAN_a8r8g8b8 || mask_format == PIXMAN_a8b8g8r8) &&
 	(src->type == BITS && src->bits.bits == mask->bits.bits)	   &&
 	(src->common.repeat == mask->common.repeat)			   &&
-	(src_flags & mask_flags & FAST_PATH_ID_TRANSFORM)		   &&
+	(info.src_flags & info.mask_flags & FAST_PATH_ID_TRANSFORM)	   &&
 	(src_x == mask_x && src_y == mask_y))
     {
 	if (src_format == PIXMAN_x8b8g8r8)
@@ -638,7 +640,7 @@ pixman_image_composite32 (pixman_op_t      op,
     extents.x2 -= dest_x - src_x;
     extents.y2 -= dest_y - src_y;
 
-    if (!analyze_extent (src, &extents, &src_flags))
+    if (!analyze_extent (src, &extents, &info.src_flags))
 	goto out;
 
     extents.x1 -= src_x - mask_x;
@@ -646,7 +648,7 @@ pixman_image_composite32 (pixman_op_t      op,
     extents.x2 -= src_x - mask_x;
     extents.y2 -= src_y - mask_y;
 
-    if (!analyze_extent (mask, &extents, &mask_flags))
+    if (!analyze_extent (mask, &extents, &info.mask_flags))
 	goto out;
 
     /* If the clip is within the source samples, and the samples are
@@ -659,16 +661,16 @@ pixman_image_composite32 (pixman_op_t      op,
 			 FAST_PATH_BILINEAR_FILTER |			\
 			 FAST_PATH_SAMPLES_COVER_CLIP_BILINEAR)
 
-    if ((src_flags & NEAREST_OPAQUE) == NEAREST_OPAQUE ||
-	(src_flags & BILINEAR_OPAQUE) == BILINEAR_OPAQUE)
+    if ((info.src_flags & NEAREST_OPAQUE) == NEAREST_OPAQUE ||
+	(info.src_flags & BILINEAR_OPAQUE) == BILINEAR_OPAQUE)
     {
-	src_flags |= FAST_PATH_IS_OPAQUE;
+	info.src_flags |= FAST_PATH_IS_OPAQUE;
     }
 
-    if ((mask_flags & NEAREST_OPAQUE) == NEAREST_OPAQUE ||
-	(mask_flags & BILINEAR_OPAQUE) == BILINEAR_OPAQUE)
+    if ((info.mask_flags & NEAREST_OPAQUE) == NEAREST_OPAQUE ||
+	(info.mask_flags & BILINEAR_OPAQUE) == BILINEAR_OPAQUE)
     {
-	mask_flags |= FAST_PATH_IS_OPAQUE;
+	info.mask_flags |= FAST_PATH_IS_OPAQUE;
     }
 
     /*
@@ -676,42 +678,35 @@ pixman_image_composite32 (pixman_op_t      op,
      * if the src or dest are opaque. The output operator should be
      * mathematically equivalent to the source.
      */
-    op = optimize_operator (op, src_flags, mask_flags, dest_flags);
+    info.op = optimize_operator (op, info.src_flags, info.mask_flags, info.dest_flags);
 
-    if (_pixman_implementation_lookup_composite (
-	    get_implementation (), op,
-	    src_format, src_flags, mask_format, mask_flags, dest_format, dest_flags,
-	    &imp, &func))
-    {
-	pixman_composite_info_t info;
-	const pixman_box32_t *pbox;
-	int n;
+    _pixman_implementation_lookup_composite (
+	get_implementation (), info.op,
+	src_format, info.src_flags,
+	mask_format, info.mask_flags,
+	dest_format, info.dest_flags,
+	&imp, &func);
 
-	info.op = op;
-	info.src_image = src;
-	info.mask_image = mask;
-	info.dest_image = dest;
-	info.src_flags = src_flags;
-	info.mask_flags = mask_flags;
-	info.dest_flags = dest_flags;
+    info.src_image = src;
+    info.mask_image = mask;
+    info.dest_image = dest;
 
-	pbox = pixman_region32_rectangles (&region, &n);
+    pbox = pixman_region32_rectangles (&region, &n);
 
-	while (n--)
-	{
-	    info.src_x = pbox->x1 + src_x - dest_x;
-	    info.src_y = pbox->y1 + src_y - dest_y;
-	    info.mask_x = pbox->x1 + mask_x - dest_x;
-	    info.mask_y = pbox->y1 + mask_y - dest_y;
-	    info.dest_x = pbox->x1;
-	    info.dest_y = pbox->y1;
-	    info.width = pbox->x2 - pbox->x1;
-	    info.height = pbox->y2 - pbox->y1;
-
-	    func (imp, &info);
-
-	    pbox++;
-	}
+    while (n--)
+    {
+	info.src_x = pbox->x1 + src_x - dest_x;
+	info.src_y = pbox->y1 + src_y - dest_y;
+	info.mask_x = pbox->x1 + mask_x - dest_x;
+	info.mask_y = pbox->y1 + mask_y - dest_y;
+	info.dest_x = pbox->x1;
+	info.dest_y = pbox->y1;
+	info.width = pbox->x2 - pbox->x1;
+	info.height = pbox->y2 - pbox->y1;
+
+	func (imp, &info);
+
+	pbox++;
     }
 
 out:
diff --git a/pixman/test/Makefile.sources b/pixman/test/Makefile.sources
index 8c0b505df..e323a8e8c 100644
--- a/pixman/test/Makefile.sources
+++ b/pixman/test/Makefile.sources
@@ -17,6 +17,7 @@ TESTPROGRAMS =			\
 	gradient-crash-test	\
 	region-contains-test	\
 	alphamap		\
+	matrix-test		\
 	stress-test		\
 	composite-traps-test	\
 	blitters-test		\
diff --git a/pixman/test/affine-test.c b/pixman/test/affine-test.c
index 678fbe844..2506250db 100644
--- a/pixman/test/affine-test.c
+++ b/pixman/test/affine-test.c
@@ -307,11 +307,11 @@ test_composite (int      testnum,
 }
 
 #if BILINEAR_INTERPOLATION_BITS == 8
-#define CHECKSUM 0x97097336
+#define CHECKSUM 0x2CDF1F07
 #elif BILINEAR_INTERPOLATION_BITS == 7
-#define CHECKSUM 0x31D2DC21
+#define CHECKSUM 0xBC00B1DF
 #elif BILINEAR_INTERPOLATION_BITS == 4
-#define CHECKSUM 0x8B925154
+#define CHECKSUM 0xA227306B
 #else
 #define CHECKSUM 0x00000000
 #endif
diff --git a/pixman/test/lowlevel-blt-bench.c b/pixman/test/lowlevel-blt-bench.c
index 2f97b7b24..7336fa0d5 100644
--- a/pixman/test/lowlevel-blt-bench.c
+++ b/pixman/test/lowlevel-blt-bench.c
@@ -630,6 +630,8 @@ tests_tbl[] =
     { "src_0565_0565",         PIXMAN_r5g6b5,      0, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_r5g6b5 },
     { "src_1555_0565",         PIXMAN_a1r5g5b5,    0, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_r5g6b5 },
     { "src_0565_1555",         PIXMAN_r5g6b5,      0, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_a1r5g5b5 },
+    { "src_8_8",               PIXMAN_a8,          0, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_a8 },
+    { "src_n_8",               PIXMAN_a8,          1, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_a8 },
     { "src_n_8_0565",          PIXMAN_a8r8g8b8,    1, PIXMAN_OP_SRC,     PIXMAN_a8,       0, PIXMAN_r5g6b5 },
     { "src_n_8_1555",          PIXMAN_a8r8g8b8,    1, PIXMAN_OP_SRC,     PIXMAN_a8,       0, PIXMAN_a1r5g5b5 },
     { "src_n_8_4444",          PIXMAN_a8r8g8b8,    1, PIXMAN_OP_SRC,     PIXMAN_a8,       0, PIXMAN_a4r4g4b4 },
@@ -772,7 +774,7 @@ main (int argc, char *argv[])
 
     for (i = 0; i < ARRAY_LENGTH (tests_tbl); i++)
     {
-	if (strcmp (pattern, "all") == 0 || strstr (tests_tbl[i].testname, pattern))
+	if (strcmp (pattern, "all") == 0 || strcmp (tests_tbl[i].testname, pattern) == 0)
 	{
 	    bench_composite (tests_tbl[i].testname,
 			     tests_tbl[i].src_fmt,
diff --git a/pixman/test/matrix-test.c b/pixman/test/matrix-test.c
new file mode 100644
index 000000000..8437dd291
--- /dev/null
+++ b/pixman/test/matrix-test.c
@@ -0,0 +1,186 @@
+/*
+ * Copyright © 2012 Siarhei Siamashka <siarhei.siamashka@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include "utils.h"
+#include <assert.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+
+#ifdef HAVE_FLOAT128
+
+#define pixman_fixed_to_float128(x) (((__float128)(x)) / 65536.0Q)
+
+typedef struct { __float128 v[3]; } pixman_vector_f128_t;
+typedef struct { __float128 m[3][3]; } pixman_transform_f128_t;
+
+pixman_bool_t /* reference transform in __float128, including the divide by the third component */
+pixman_transform_point_f128 (const pixman_transform_f128_t *t,
+                             const pixman_vector_f128_t    *v,
+                             pixman_vector_f128_t          *result)
+{
+    int i;
+    for (i = 0; i < 3; i++) /* result = t * v, one row per iteration */
+    {
+        result->v[i] = t->m[i][0] * v->v[0] +
+                       t->m[i][1] * v->v[1] +
+                       t->m[i][2] * v->v[2];
+    }
+    if (result->v[2] != 0) /* normalise so the third component becomes 1 */
+    {
+        result->v[0] /= result->v[2];
+        result->v[1] /= result->v[2];
+        result->v[2] = 1;
+        return TRUE;
+    }
+    else
+    {
+        return FALSE; /* third component is 0: result keeps the undivided products */
+    }
+}
+
+pixman_bool_t does_it_fit_fixed_48_16 (__float128 x) /* TRUE iff -2^47 < x < 2^47 */
+{
+    if (x >= 65536.0Q * 65536.0Q * 32768.0Q) /* 2^16 * 2^16 * 2^15 = 2^47 */
+        return FALSE;
+    if (x <= -65536.0Q * 65536.0Q * 32768.0Q) /* exactly -2^47 is rejected as well */
+        return FALSE;
+    return TRUE;
+}
+
+#endif
+
+uint32_t /* CRC32 accumulated over the fixed-point results of 100 random transforms */
+test_matrix (int testnum, int verbose)
+{
+    uint32_t crc32 = 0;
+    int i, j, k;
+    pixman_bool_t is_affine;
+    /* the PRNG is seeded with the test number before any random data is drawn */
+    prng_srand (testnum);
+
+    for (i = 0; i < 100; i++)
+    {
+        pixman_bool_t           transform_ok;
+        pixman_transform_t      ti;
+        pixman_vector_48_16_t   vi, result_i;
+#ifdef HAVE_FLOAT128
+        pixman_transform_f128_t tf;
+        pixman_vector_f128_t    vf, result_f;
+#endif
+        prng_randmemset (&ti, sizeof(ti), 0);
+        prng_randmemset (&vi, sizeof(vi), 0);
+
+        for (j = 0; j < 3; j++)
+        {
+            /* make sure that "vi" contains 31.16 fixed point data */
+            vi.v[j] >>= 17;
+            /* and apply random shift */
+            if (prng_rand_n (3) == 0)
+                vi.v[j] >>= prng_rand_n (46);
+        }
+
+        if (prng_rand_n (2))
+        {
+            /* random shift for the matrix */
+            for (j = 0; j < 3; j++)
+                for (k = 0; k < 3; k++)
+                    ti.matrix[j][k] >>= prng_rand_n (30);
+        }
+
+        if (prng_rand_n (2))
+        {
+            /* affine matrix */
+            ti.matrix[2][0] = 0;
+            ti.matrix[2][1] = 0;
+            ti.matrix[2][2] = pixman_fixed_1;
+        }
+
+        if (prng_rand_n (2))
+        {
+            /* cartesian coordinates */
+            vi.v[2] = pixman_fixed_1;
+        }
+        /* recomputed from the data, so a random matrix that happens to be affine counts too */
+        is_affine = (ti.matrix[2][0] == 0 && ti.matrix[2][1] == 0 &&
+                     ti.matrix[2][2] == pixman_fixed_1 &&
+                     vi.v[2] == pixman_fixed_1);
+
+        transform_ok = TRUE; /* the affine entry point returns void, so it keeps this value */
+        if (is_affine && prng_rand_n (2))
+            pixman_transform_point_31_16_affine (&ti, &vi, &result_i);
+        else
+            transform_ok = pixman_transform_point_31_16 (&ti, &vi, &result_i);
+        /* only the fixed-point result feeds the checksum, whatever transform_ok is */
+        crc32 = compute_crc32 (crc32, &result_i, sizeof(result_i));
+
+#ifdef HAVE_FLOAT128
+        /* compare with a reference 128-bit floating point implementation */
+        for (j = 0; j < 3; j++)
+        {
+            vf.v[j] = pixman_fixed_to_float128 (vi.v[j]);
+            for (k = 0; k < 3; k++)
+            {
+                tf.m[j][k] = pixman_fixed_to_float128 (ti.matrix[j][k]);
+            }
+        }
+
+        if (pixman_transform_point_f128 (&tf, &vf, &result_f))
+        {
+            if (transform_ok || /* a reported failure is accepted only when the reference result does not fit */
+                (does_it_fit_fixed_48_16 (result_f.v[0]) &&
+                 does_it_fit_fixed_48_16 (result_f.v[1]) &&
+                 does_it_fit_fixed_48_16 (result_f.v[2])))
+            {
+                for (j = 0; j < 3; j++)
+                {
+                    double diff = fabs (result_f.v[j] - /* narrowed to double by the fabs call */
+                                        pixman_fixed_to_float128 (result_i.v[j]));
+
+                    if (is_affine && diff > (0.51 / 65536.0)) /* tolerance in units of 1/65536 */
+                    {
+                        printf ("%d:%d: bad precision for affine (%.12f)\n",
+                               testnum, i, diff);
+                        abort ();
+                    }
+                    else if (diff > (0.71 / 65536.0)) /* looser bound; also reached by affine cases within 0.51 */
+                    {
+                        printf ("%d:%d: bad precision for projective (%.12f)\n",
+                               testnum, i, diff);
+                        abort ();
+                    }
+                }
+            }
+        }
+#endif
+    }
+    return crc32;
+}
+
+int
+main (int argc, const char *argv[])
+{
+    return fuzzer_test_main ("matrix", 20000, /* NOTE(review): presumably the number of test_matrix runs */
+			     0xBEBF98C3, /* NOTE(review): presumably the expected checksum; confirm in utils.h */
+			     test_matrix, argc, argv);
+}