From fdef5bff99e6079f64bc6b91c91b42195c85adeb Mon Sep 17 00:00:00 2001
From: marha <marha@users.sourceforge.net>
Date: Mon, 2 Jul 2012 08:47:42 +0200
Subject: mesa pixman xserver git update 2 Jul 2012

---
 mesalib/src/gallium/auxiliary/Android.mk           |   3 +-
 mesalib/src/gallium/auxiliary/Makefile             |   2 -
 mesalib/src/gallium/auxiliary/Makefile.sources     |   3 +-
 mesalib/src/gallium/auxiliary/SConscript           |   7 -
 mesalib/src/gallium/auxiliary/util/.gitignore      |   1 -
 mesalib/src/gallium/auxiliary/util/u_format.c      |  32 +++
 mesalib/src/gallium/auxiliary/util/u_format.h      |   7 +
 .../src/gallium/auxiliary/util/u_format_tests.c    |  40 ++-
 mesalib/src/gallium/auxiliary/util/u_half.h        | 101 +++++---
 mesalib/src/gallium/auxiliary/util/u_half.py       | 179 -------------
 mesalib/src/gallium/auxiliary/util/u_math.h        | 104 +++++++-
 mesalib/src/mesa/main/mtypes.h                     |   6 -
 mesalib/src/mesa/state_tracker/st_cb_fbo.c         |  10 +-
 pixman/pixman/Makefile.am                          |   1 +
 pixman/pixman/loongson-mmintrin.h                  | 116 +++++++++
 pixman/pixman/pixman-arm-neon-asm-bilinear.S       | 119 ++++-----
 pixman/pixman/pixman-arm-neon-asm.S                | 159 ++++++------
 pixman/pixman/pixman-bits-image.c                  |  16 +-
 pixman/pixman/pixman-inlines.h                     |  37 ++-
 pixman/pixman/pixman-mips-dspr2-asm.S              |   9 +-
 pixman/pixman/pixman-mmx.c                         | 280 ++++++++++++++++++++-
 pixman/pixman/pixman-private.h                     |  22 +-
 pixman/pixman/pixman-sse2.c                        |  45 +++-
 pixman/test/affine-test.c                          |  12 +-
 pixman/test/scaling-test.c                         |  12 +-
 xorg-server/randr/rrscreen.c                       |   2 +-
 26 files changed, 898 insertions(+), 427 deletions(-)
 delete mode 100644 mesalib/src/gallium/auxiliary/util/u_half.py

diff --git a/mesalib/src/gallium/auxiliary/Android.mk b/mesalib/src/gallium/auxiliary/Android.mk
index 0c37dd31a..11fc2256a 100644
--- a/mesalib/src/gallium/auxiliary/Android.mk
+++ b/mesalib/src/gallium/auxiliary/Android.mk
@@ -44,8 +44,7 @@ $(LOCAL_GENERATED_SOURCES): PRIVATE_CUSTOM_TOOL = $(PRIVATE_PYTHON) $^ > $@
 
 $(intermediates)/indices/u_indices_gen.c \
 $(intermediates)/indices/u_unfilled_gen.c \
-$(intermediates)/util/u_format_srgb.c \
-$(intermediates)/util/u_half.c: $(intermediates)/%.c: $(LOCAL_PATH)/%.py
+$(intermediates)/util/u_format_srgb.c: $(intermediates)/%.c: $(LOCAL_PATH)/%.py
 	$(transform-generated-source)
 
 $(intermediates)/util/u_format_table.c: $(intermediates)/%.c: $(LOCAL_PATH)/%.py $(LOCAL_PATH)/util/u_format.csv
diff --git a/mesalib/src/gallium/auxiliary/Makefile b/mesalib/src/gallium/auxiliary/Makefile
index a70ae7384..3ba3f9c40 100644
--- a/mesalib/src/gallium/auxiliary/Makefile
+++ b/mesalib/src/gallium/auxiliary/Makefile
@@ -39,6 +39,4 @@ util/u_format_srgb.c: util/u_format_srgb.py
 util/u_format_table.c: util/u_format_table.py util/u_format_pack.py util/u_format_parse.py util/u_format.csv
 	$(PYTHON2) util/u_format_table.py util/u_format.csv > $@
 
-util/u_half.c: util/u_half.py
-	$(PYTHON2) util/u_half.py > $@
 # DO NOT DELETE
diff --git a/mesalib/src/gallium/auxiliary/Makefile.sources b/mesalib/src/gallium/auxiliary/Makefile.sources
index 277428b38..28a176d68 100644
--- a/mesalib/src/gallium/auxiliary/Makefile.sources
+++ b/mesalib/src/gallium/auxiliary/Makefile.sources
@@ -155,8 +155,7 @@ GENERATED_SOURCES := \
 	indices/u_indices_gen.c \
 	indices/u_unfilled_gen.c \
 	util/u_format_srgb.c \
-	util/u_format_table.c \
-	util/u_half.c
+	util/u_format_table.c
 
 GALLIVM_SOURCES := \
         gallivm/lp_bld_arit.c \
diff --git a/mesalib/src/gallium/auxiliary/SConscript b/mesalib/src/gallium/auxiliary/SConscript
index 07c420e13..bfd5ec34c 100644
--- a/mesalib/src/gallium/auxiliary/SConscript
+++ b/mesalib/src/gallium/auxiliary/SConscript
@@ -35,13 +35,6 @@ env.CodeGenerate(
     command = python_cmd + ' $SCRIPT $SOURCE > $TARGET'
 )
 
-env.CodeGenerate(
-    target = 'util/u_half.c',
-    script = 'util/u_half.py',
-    source = [],
-    command = python_cmd + ' $SCRIPT > $TARGET'
-)
-
 env.Depends('util/u_format_table.c', [
     '#src/gallium/auxiliary/util/u_format_parse.py',
     'util/u_format_pack.py', 
diff --git a/mesalib/src/gallium/auxiliary/util/.gitignore b/mesalib/src/gallium/auxiliary/util/.gitignore
index 5dd0408ef..da74de623 100644
--- a/mesalib/src/gallium/auxiliary/util/.gitignore
+++ b/mesalib/src/gallium/auxiliary/util/.gitignore
@@ -1,3 +1,2 @@
 u_format_srgb.c
 u_format_table.c
-u_half.c
diff --git a/mesalib/src/gallium/auxiliary/util/u_format.c b/mesalib/src/gallium/auxiliary/util/u_format.c
index cfc4a17a0..6f4529835 100644
--- a/mesalib/src/gallium/auxiliary/util/u_format.c
+++ b/mesalib/src/gallium/auxiliary/util/u_format.c
@@ -158,6 +158,38 @@ util_format_is_pure_uint(enum pipe_format format)
    return (desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED && desc->channel[i].pure_integer) ? TRUE : FALSE;
 }
 
+boolean
+util_format_is_array(const struct util_format_description *desc)
+{
+   unsigned chan;
+   /* TRUE only for plain RGB formats with 1x1 blocks whose channels are unswizzled and uniform. */
+   if (desc->layout != UTIL_FORMAT_LAYOUT_PLAIN ||
+       desc->colorspace != UTIL_FORMAT_COLORSPACE_RGB ||
+       desc->block.width != 1 ||
+       desc->block.height != 1) {
+      return FALSE;
+   }
+
+   for (chan = 0; chan < desc->nr_channels; ++chan) {
+      if (desc->swizzle[chan] != chan) /* swizzle must map each channel to itself */
+         return FALSE;
+      /* The remaining checks compare every channel against channel 0. */
+      if (desc->channel[chan].type != desc->channel[0].type)
+         return FALSE;
+
+      if (desc->channel[chan].normalized != desc->channel[0].normalized)
+         return FALSE;
+
+      if (desc->channel[chan].pure_integer != desc->channel[0].pure_integer)
+         return FALSE;
+
+      if (desc->channel[chan].size != desc->channel[0].size)
+         return FALSE;
+   }
+
+   return TRUE;
+}
+
 boolean
 util_format_is_luminance_alpha(enum pipe_format format)
 {
diff --git a/mesalib/src/gallium/auxiliary/util/u_format.h b/mesalib/src/gallium/auxiliary/util/u_format.h
index 1718fb5e2..e35e164b4 100644
--- a/mesalib/src/gallium/auxiliary/util/u_format.h
+++ b/mesalib/src/gallium/auxiliary/util/u_format.h
@@ -590,6 +590,13 @@ util_format_is_pure_sint(enum pipe_format format);
 boolean
 util_format_is_pure_uint(enum pipe_format format);
 
+/** Whether the format is a simple array format: a plain RGB layout with 1x1
+ * blocks whose channels are unswizzled and all share the same type, size,
+ * normalization and pure-integer flag, so it can be loaded as a vector.
+ */
+boolean
+util_format_is_array(const struct util_format_description *desc);
+
 /**
  * Check if the src format can be blitted to the destination format with
  * a simple memcpy.  For example, blitting from RGBA to RGBx is OK, but not
diff --git a/mesalib/src/gallium/auxiliary/util/u_format_tests.c b/mesalib/src/gallium/auxiliary/util/u_format_tests.c
index fc29d8d48..457fda6c8 100644
--- a/mesalib/src/gallium/auxiliary/util/u_format_tests.c
+++ b/mesalib/src/gallium/auxiliary/util/u_format_tests.c
@@ -26,6 +26,9 @@
  **************************************************************************/
 
 
+#include <float.h>
+
+#include "pipe/p_config.h"
 #include "u_memory.h"
 #include "u_format_tests.h"
 
@@ -63,6 +66,9 @@
        {{ 0,  0,  0,  0}, { 0,  0,  0,  0}, {0, 0, 0, 0}, {0, 0, 0, 0}}}
 
 
+#define NAN (0.0 / 0.0) /* NaN for expected test values; its uses below sit under !defined(PIPE_CC_MSVC) */
+#define INF (1.0 / 0.0) /* infinity for expected test values; used under the same MSVC guard */
+
 /**
  * Test cases.
  *
@@ -876,7 +882,39 @@ util_format_test_cases[] =
     * Half float formats
     */
 
-   {PIPE_FORMAT_R16_FLOAT, PACKED_1x16(0xffff), PACKED_1x16(0x0000), UNPACKED_1x1(  0.0, 0.0, 0.0, 1.0)},
+   /* Minimum positive normal */
+   {PIPE_FORMAT_R16_FLOAT, PACKED_1x16(0xffff), PACKED_1x16(0x0400), UNPACKED_1x1( 6.10352E-5, 0.0, 0.0, 1.0)},
+
+   /* Max denormal */
+   {PIPE_FORMAT_R16_FLOAT, PACKED_1x16(0xffff), PACKED_1x16(0x03FF), UNPACKED_1x1( 6.09756E-5, 0.0, 0.0, 1.0)},
+
+   /* Minimum positive denormal */
+   {PIPE_FORMAT_R16_FLOAT, PACKED_1x16(0xffff), PACKED_1x16(0x0001), UNPACKED_1x1( 5.96046E-8, 0.0, 0.0, 1.0)},
+
+   /* Min representable value */
+   {PIPE_FORMAT_R16_FLOAT, PACKED_1x16(0xffff), PACKED_1x16(0xfbff), UNPACKED_1x1(   -65504.0, 0.0, 0.0, 1.0)},
+
+   /* Max representable value */
+   {PIPE_FORMAT_R16_FLOAT, PACKED_1x16(0xffff), PACKED_1x16(0x7bff), UNPACKED_1x1(    65504.0, 0.0, 0.0, 1.0)},
+
+#if !defined(PIPE_CC_MSVC)
+
+   /* NaNs */
+   {PIPE_FORMAT_R16_FLOAT, PACKED_1x16(0xffff), PACKED_1x16(0x7c01), UNPACKED_1x1(        NAN, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_R16_FLOAT, PACKED_1x16(0xffff), PACKED_1x16(0xfc01), UNPACKED_1x1(       -NAN, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_R16_FLOAT, PACKED_1x16(0xffff), PACKED_1x16(0x7fff), UNPACKED_1x1(        NAN, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_R16_FLOAT, PACKED_1x16(0xffff), PACKED_1x16(0xffff), UNPACKED_1x1(       -NAN, 0.0, 0.0, 1.0)},
+
+   /* Inf */
+   {PIPE_FORMAT_R16_FLOAT, PACKED_1x16(0xffff), PACKED_1x16(0x7c00), UNPACKED_1x1(        INF, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_R16_FLOAT, PACKED_1x16(0xffff), PACKED_1x16(0xfc00), UNPACKED_1x1(       -INF, 0.0, 0.0, 1.0)},
+
+#endif
+
+   /* Zero, ignore sign */
+   {PIPE_FORMAT_R16_FLOAT, PACKED_1x16(0x7fff), PACKED_1x16(0x8000), UNPACKED_1x1( -0.0, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_R16_FLOAT, PACKED_1x16(0x7fff), PACKED_1x16(0x0000), UNPACKED_1x1(  0.0, 0.0, 0.0, 1.0)},
+
    {PIPE_FORMAT_R16_FLOAT, PACKED_1x16(0xffff), PACKED_1x16(0x3c00), UNPACKED_1x1(  1.0, 0.0, 0.0, 1.0)},
    {PIPE_FORMAT_R16_FLOAT, PACKED_1x16(0xffff), PACKED_1x16(0xbc00), UNPACKED_1x1( -1.0, 0.0, 0.0, 1.0)},
 
diff --git a/mesalib/src/gallium/auxiliary/util/u_half.h b/mesalib/src/gallium/auxiliary/util/u_half.h
index ad030e90c..f7009f548 100644
--- a/mesalib/src/gallium/auxiliary/util/u_half.h
+++ b/mesalib/src/gallium/auxiliary/util/u_half.h
@@ -35,51 +35,84 @@
 extern "C" {
 #endif
 
-extern const uint32_t util_half_to_float_mantissa_table[2048];
-extern const uint32_t util_half_to_float_exponent_table[64];
-extern const uint32_t util_half_to_float_offset_table[64];
-extern const uint16_t util_float_to_half_base_table[512];
-extern const uint8_t util_float_to_half_shift_table[512];
-
 /*
- * Note that if the half float is a signaling NaN, the x87 FPU will turn
- * it into a quiet NaN immediately upon loading into a float.
- *
- * Additionally, denormals may be flushed to zero.
+ * References for float <-> half conversions
  *
- * To avoid this, use the floatui functions instead of the float ones
- * when just doing conversion rather than computation on the resulting
- * floats.
+ *  http://fgiesen.wordpress.com/2012/03/28/half-to-float-done-quic/
+ *  https://gist.github.com/2156668
+ *  https://gist.github.com/2144712
  */
 
-static INLINE uint32_t
-util_half_to_floatui(uint16_t h)
+static INLINE uint16_t
+util_float_to_half(float f)
 {
-   unsigned exp = h >> 10;
-   return util_half_to_float_mantissa_table[util_half_to_float_offset_table[exp] + (h & 0x3ff)] + util_half_to_float_exponent_table[exp];
+   uint32_t sign_mask  = 0x80000000; /* top bit of the 32-bit float pattern */
+   uint32_t round_mask = ~0xfff;     /* keeps everything except the low 12 bits */
+   uint32_t f32inf = 0xff << 23;     /* float exponent field all ones, mantissa zero */
+   uint32_t f16inf = 0x1f << 23;     /* shifted right by 13 below this becomes 0x7c00 */
+   uint32_t sign;
+   union fi magic;
+   union fi f32;
+   uint16_t f16;                     /* result: the 16-bit half pattern for f */
+
+   magic.ui = 0xf << 23;             /* float whose biased exponent field is 15 */
+
+   f32.f = f;
+
+   /* Sign: split it off so the rest works on the magnitude bits only */
+   sign = f32.ui & sign_mask;
+   f32.ui ^= sign;
+
+   if (f32.ui == f32inf) {
+      /* Inf */
+      f16 = 0x7c00;
+   } else if (f32.ui > f32inf) {
+      /* NaN: always returned as 0x7e00, the input mantissa is not carried over */
+      f16 = 0x7e00;
+   } else {
+      /* Number: clear the low bits, rescale through a float multiply, then round */
+      f32.ui &= round_mask;
+      f32.f  *= magic.f;
+      f32.ui -= round_mask;          /* unsigned wraparound: same as adding 0x1000 */
+
+      /* Clamp to infinity if overflowed */
+      if (f32.ui > f16inf)
+         f32.ui = f16inf;
+
+      f16 = f32.ui >> 13;            /* narrow to 16 bits; f16inf maps to 0x7c00 */
+   }
+
+   /* Sign: bit 31 of the float becomes bit 15 of the half */
+   f16 |= sign >> 16;
+
+   return f16;
+}
 
 static INLINE float
-util_half_to_float(uint16_t h)
+util_half_to_float(uint16_t f16)
 {
-   union fi r;
-   r.ui = util_half_to_floatui(h);
-   return r.f;
-}
+   union fi infnan; /* threshold: scaled values at or above it are Inf/NaN */
+   union fi magic;  /* multiplier that rebases the exponent */
+   union fi f32;    /* result: the half pattern f16 widened to a 32-bit float */
 
-static INLINE uint16_t
-util_floatui_to_half(uint32_t v)
-{
-   unsigned signexp = v >> 23;
-   return util_float_to_half_base_table[signexp] + ((v & 0x007fffff) >> util_float_to_half_shift_table[signexp]);
-}
+   infnan.ui = 0x8f << 23;
+   infnan.f = 65536.0f;      /* overwrites the store above; NOTE(review): presumably the same bits under IEEE-754 */
+   magic.ui  = 0xef << 23;   /* float whose biased exponent field is 0xef */
 
-static INLINE uint16_t
-util_float_to_half(float f)
-{
-   union fi i;
-   i.f = f;
-   return util_floatui_to_half(i.ui);
+   /* Exponent / Mantissa: drop the sign bit and move the rest up 13 bits */
+   f32.ui = (f16 & 0x7fff) << 13;
+
+   /* Adjust: rescale through a float multiply */
+   f32.f *= magic.f;
+
+   /* Inf / NaN: force the float exponent field to all ones */
+   if (f32.f >= infnan.f)
+      f32.ui |= 0xff << 23;
+
+   /* Sign: shift as unsigned, because 0x8000 << 16 does not fit in a signed int */
+   f32.ui |= (uint32_t)(f16 & 0x8000) << 16;
+
+   return f32.f;
 }
 
 #ifdef __cplusplus
diff --git a/mesalib/src/gallium/auxiliary/util/u_half.py b/mesalib/src/gallium/auxiliary/util/u_half.py
deleted file mode 100644
index 915cf3b92..000000000
--- a/mesalib/src/gallium/auxiliary/util/u_half.py
+++ /dev/null
@@ -1,179 +0,0 @@
-# Copyright 2010 Luca Barbieri
-#
-# Permission is hereby granted, free of charge, to any person obtaining
-# a copy of this software and associated documentation files (the
-# "Software"), to deal in the Software without restriction, including
-# without limitation the rights to use, copy, modify, merge, publish,
-# distribute, sublicense, and/or sell copies of the Software, and to
-# permit persons to whom the Software is furnished to do so, subject to
-# the following conditions:
-#
-# The above copyright notice and this permission notice (including the
-# next paragraph) shall be included in all copies or substantial
-# portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-# IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
-# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
-# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
-# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-#
-# *************************************************************************
-
-# The code is a reimplementation of the algorithm in
-#  www.fox-toolkit.org/ftp/fasthalffloatconversion.pdf
-# "Fast Half Float Conversions" by Jeroen van der Zijp, Nov 2008
-#
-# The table contents have been slightly changed so that the exponent
-# bias is now in the exponent table instead of the mantissa table (mostly
-# for cosmetic reasons, and because it theoretically allows a variant
-# that flushes denormal to zero but uses a mantissa table with 24-bit
-# entries).
-#
-# The tables are also constructed slightly differently.
-#
-
-# Note that using a 64K * 4 table is a terrible idea since it will not fit
-# in the L1 cache and will massively pollute the L2 cache as well
-#
-# These should instead fit in the L1 cache.
-#
-# TODO: we could use a denormal bias table instead of the mantissa/offset
-# tables: this would reduce the L1 cache usage from 8704 to 2304 bytes
-# but would involve more computation
-#
-# Note however that if denormals are never encountered, the L1 cache usage
-# is only about 4608 bytes anyway.
-
-table_index = None
-table_length = None
-
-def begin(t, n, l):
-	global table_length
-	global table_index
-	table_index = 0
-	table_length = l
-	print
-	print "const " + t + " " + n + "[" + str(l) + "] = {"
-
-def value(v):
-	global table_index
-	table_index += 1
-	print "\t" + hex(v) + ","
-
-def end():
-	global table_length
-	global table_index
-	print "};"
-	assert table_index == table_length
-
-print "/* This file is autogenerated by u_half.py. Do not edit directly. */"
-print "#include \"util/u_half.h\""
-
-begin("uint32_t", "util_half_to_float_mantissa_table", 2048)
-# zero
-value(0)
-
-# denormals
-for i in xrange(1, 1024):
-	m = i << 13
-	e = 0
-
-	# normalize number
-	while (m & 0x00800000) == 0:
-		e -= 0x00800000
-		m <<= 1
-
-	m &= ~0x00800000
-	e += 0x38800000
-	value(m | e)
-
-# normals
-for i in xrange(1024, 2048):
-	value((i - 1024) << 13)
-end()
-
-begin("uint32_t", "util_half_to_float_exponent_table", 64)
-# positive zero or denormals
-value(0)
-
-# positive numbers
-for i in xrange(1, 31):
-	value(0x38000000 + (i << 23))
-
-# positive infinity/NaN
-value(0x7f800000)
-
-# negative zero or denormals
-value(0x80000000)
-
-# negative numbers
-for i in range(33, 63):
-	value(0xb8000000 + ((i - 32) << 23))
-
-# negative infinity/NaN
-value(0xff800000)
-end()
-
-begin("uint32_t", "util_half_to_float_offset_table", 64)
-# positive zero or denormals
-value(0)
-
-# positive normals
-for i in range(1, 32):
-	value(1024)
-
-# negative zero or denormals
-value(0)
-
-# negative normals
-for i in xrange(33, 64):
-	value(1024)
-end()
-
-begin("uint16_t", "util_float_to_half_base_table", 512)
-for sign in (0, 0x8000):
-	# very small numbers mapping to zero
-	for i in xrange(-127, -24):
-		value(sign | 0)
-
-	# small numbers mapping to denormals
-	for i in xrange(-24, -14):
-		value(sign | (0x400 >> (-14 -i)))
-
-	# normal numbers
-	for i in xrange(-14, 16):
-		value(sign | ((i + 15) << 10))
-
-	# large numbers mapping to infinity
-	for i in xrange(16, 128):
-		value(sign | 0x7c00)
-
-	# infinity and NaNs
-	value(sign | 0x7c00)
-end()
-
-begin("uint8_t", "util_float_to_half_shift_table", 512)
-for sign in (0, 0x8000):
-	# very small numbers mapping to zero
-	for i in xrange(-127, -24):
-		value(24)
-
-	# small numbers mapping to denormals
-	for i in xrange(-24, -14):
-		value(-1 - i)
-
-	# normal numbers
-	for i in xrange(-14, 16):
-		value(13)
-
-	# large numbers mapping to infinity
-	for i in xrange(16, 128):
-		value(24)
-
-	# infinity and NaNs
-	value(13)
-end()
-
diff --git a/mesalib/src/gallium/auxiliary/util/u_math.h b/mesalib/src/gallium/auxiliary/util/u_math.h
index f6196665f..724b136b5 100644
--- a/mesalib/src/gallium/auxiliary/util/u_math.h
+++ b/mesalib/src/gallium/auxiliary/util/u_math.h
@@ -183,6 +183,13 @@ union fi {
 };
 
 
+union di {        /* one 64-bit value viewed as a double or as an integer */
+   double d;
+   int64_t i;     /* signed view of the same storage */
+   uint64_t ui;   /* unsigned view, used for the bit-mask tests on doubles */
+};
+
+
 /**
  * Fast version of 2^x
  * Identity: exp2(a + b) = exp2(a) * exp2(b)
@@ -325,14 +332,107 @@ util_is_approx(float a, float b, float tol)
 
 
 /**
- * Test if x is NaN or +/- infinity.
+ * util_is_X_inf_or_nan = test if x is NaN or +/- Inf
+ * util_is_X_nan        = test if x is NaN
+ * util_X_inf_sign      = return +1 for +Inf, -1 for -Inf, or 0 for not Inf
+ *
+ * NaN can be checked with x != x, however this fails with the fast math flag
+ **/
+
+
+/**
+ * Single-float
  */
 static INLINE boolean
 util_is_inf_or_nan(float x)
 {
    union fi tmp;
    tmp.f = x;
-   return !(int)((unsigned int)((tmp.i & 0x7fffffff)-0x7f800000) >> 31);
+   return (tmp.ui & 0x7f800000) == 0x7f800000; /* true when all bits of the 0x7f800000 mask are set */
 }
+
+
+static INLINE boolean
+util_is_nan(float x)
+{
+   union fi tmp;
+   tmp.f = x;   /* inspect the bit pattern rather than compare floats */
+   return (tmp.ui & 0x7fffffff) > 0x7f800000;   /* sign bit masked off; strictly above the Inf pattern */
+}
+
+
+static INLINE int
+util_inf_sign(float x)
+{
+   union fi tmp;
+   tmp.f = x;
+   if ((tmp.ui & 0x7fffffff) != 0x7f800000) {   /* sign bit masked off; anything but the Inf pattern */
+      return 0;
+   }
+   /* x is +Inf or -Inf here: report which one as +1 / -1 */
+   return (x < 0) ? -1 : 1;
+}
+
+
+/**
+ * Double-float
+ */
+static INLINE boolean
+util_is_double_inf_or_nan(double x)
+{
+   union di tmp;
+   tmp.d = x;   /* ULL keeps the 64-bit masks valid where long is 32 bits */
+   return (tmp.ui & 0x7ff0000000000000ULL) == 0x7ff0000000000000ULL;
+}
+
+
+static INLINE boolean
+util_is_double_nan(double x)
+{
+   union di tmp;
+   tmp.d = x;   /* sign bit masked off below; NaN is anything strictly above the Inf pattern */
+   return (tmp.ui & 0x7fffffffffffffffULL) > 0x7ff0000000000000ULL;
+}
+
+
+static INLINE int
+util_double_inf_sign(double x)
+{
+   union di tmp;
+   tmp.d = x;   /* ULL keeps the 64-bit masks valid where long is 32 bits */
+   if ((tmp.ui & 0x7fffffffffffffffULL) != 0x7ff0000000000000ULL) {
+      return 0;
+   }
+   /* x is +Inf or -Inf here: report which one as +1 / -1 */
+   return (x < 0) ? -1 : 1;
+}
+
+
+/**
+ * Half-float
+ */
+static INLINE boolean
+util_is_half_inf_or_nan(int16_t x)
+{
+   return (x & 0x7c00) == 0x7c00;   /* true when all bits of the 0x7c00 mask are set in the raw half pattern */
+}
+
+
+static INLINE boolean
+util_is_half_nan(int16_t x)
+{
+   return (x & 0x7fff) > 0x7c00;   /* bit 15 masked off; strictly above the 0x7c00 Inf pattern */
+}
+
+
+static INLINE int
+util_half_inf_sign(int16_t x)
+{
+   if ((x & 0x7fff) != 0x7c00) {   /* bit 15 masked off; anything but the Inf pattern */
+      return 0;
+   }
+   /* x is a signed 16-bit pattern, so x < 0 is true exactly when bit 15 is set */
+   return (x < 0) ? -1 : 1;
+}
 
 
diff --git a/mesalib/src/mesa/main/mtypes.h b/mesalib/src/mesa/main/mtypes.h
index 5768ed7cd..bdbb5137e 100644
--- a/mesalib/src/mesa/main/mtypes.h
+++ b/mesalib/src/mesa/main/mtypes.h
@@ -3200,12 +3200,6 @@ struct gl_dlist_state
    GLubyte ActiveMaterialSize[MAT_ATTRIB_MAX];
    GLfloat CurrentMaterial[MAT_ATTRIB_MAX][4];
 
-   GLubyte ActiveIndex;
-   GLfloat CurrentIndex;
-   
-   GLubyte ActiveEdgeFlag;
-   GLboolean CurrentEdgeFlag;
-
    struct {
       /* State known to have been set by the currently-compiling display
        * list.  Used to eliminate some redundant state changes.
diff --git a/mesalib/src/mesa/state_tracker/st_cb_fbo.c b/mesalib/src/mesa/state_tracker/st_cb_fbo.c
index 10f4e09cf..e1818abb9 100644
--- a/mesalib/src/mesa/state_tracker/st_cb_fbo.c
+++ b/mesalib/src/mesa/state_tracker/st_cb_fbo.c
@@ -57,10 +57,6 @@
 #include "util/u_surface.h"
 
 
-/** Set to 1 to enable extra debug code */
-#define ST_DEBUG_FBO 0
-
-
 static GLboolean
 st_renderbuffer_alloc_sw_storage(struct gl_context * ctx,
                                  struct gl_renderbuffer *rb,
@@ -480,9 +476,9 @@ st_finish_render_texture(struct gl_context *ctx,
 static void
 st_fbo_invalid(const char *reason)
 {
-#if ST_DEBUG_FBO
-   debug_printf("Invalid FBO: %s\n", reason);
-#endif
+   if (MESA_DEBUG_FLAGS & DEBUG_INCOMPLETE_FBO) {   /* print only when this debug flag bit is set */
+      _mesa_debug(NULL, "Invalid FBO: %s\n", reason);   /* reason is the caller-supplied text */
+   }
 }
 
 
diff --git a/pixman/pixman/Makefile.am b/pixman/pixman/Makefile.am
index 1b232ad0f..deacf8728 100644
--- a/pixman/pixman/Makefile.am
+++ b/pixman/pixman/Makefile.am
@@ -92,6 +92,7 @@ endif
 
 # iwmmxt code
 if USE_ARM_IWMMXT
+libpixman_iwmmxt_la_SOURCES = pixman-mmx.c
 noinst_LTLIBRARIES += libpixman-iwmmxt.la
 libpixman_1_la_LIBADD += libpixman-iwmmxt.la
 
diff --git a/pixman/pixman/loongson-mmintrin.h b/pixman/pixman/loongson-mmintrin.h
index 1a114fe0f..086c6e0f1 100644
--- a/pixman/pixman/loongson-mmintrin.h
+++ b/pixman/pixman/loongson-mmintrin.h
@@ -44,6 +44,28 @@ _mm_setzero_si64 (void)
 	return 0.0;
 }
 
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_add_pi16 (__m64 __m1, __m64 __m2)
+{	/* Runs the paddh instruction on __m1 and __m2 and returns its result. */
+	__m64 ret;	/* output operand %0 */
+	asm("paddh %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m1), "f" (__m2)
+	);
+	return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_add_pi32 (__m64 __m1, __m64 __m2)
+{	/* Runs the paddw instruction on __m1 and __m2 and returns its result. */
+	__m64 ret;	/* output operand %0 */
+	asm("paddw %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m1), "f" (__m2)
+	);
+	return ret;
+}
+
 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_adds_pu16 (__m64 __m1, __m64 __m2)
 {
@@ -149,6 +171,78 @@ _mm_packs_pu16 (__m64 __m1, __m64 __m2)
 	return ret;
 }
 
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_packs_pi32 (__m64 __m1, __m64 __m2)
+{	/* Runs the packsswh instruction on __m1 and __m2 and returns its result. */
+	__m64 ret;	/* output operand %0 */
+	asm("packsswh %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m1), "f" (__m2)
+	);
+	return ret;
+}
+
+#define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
+ (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0)) /* packs the four arguments at bit offsets 6, 4, 2 and 0 */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set_pi16 (uint16_t __w3, uint16_t __w2, uint16_t __w1, uint16_t __w0)
+{	/* Builds an __m64 from four 16-bit values; __w3 lands in bits 48-63, __w0 in bits 0-15. */
+	if (__builtin_constant_p (__w3) &&
+	    __builtin_constant_p (__w2) &&
+	    __builtin_constant_p (__w1) &&
+	    __builtin_constant_p (__w0))
+	{	/* all four arguments are compile-time constants: assemble the 64-bit pattern directly */
+		uint64_t val = ((uint64_t)__w3 << 48)
+			     | ((uint64_t)__w2 << 32)
+			     | ((uint64_t)__w1 << 16)
+			     | ((uint64_t)__w0 <<  0);
+		return *(__m64 *)&val;	/* NOTE(review): type pun through a pointer cast - confirm aliasing flags */
+	}
+	else if (__w3 == __w2 && __w2 == __w1 && __w1 == __w0)
+	{	/* all four values are equal: hand one copy to pshufh with selector 0 */
+		/* TODO: handle other cases */
+		uint64_t val = __w3;
+		uint64_t imm = _MM_SHUFFLE (0, 0, 0, 0);
+		__m64 ret;
+		asm("pshufh %0, %1, %2\n\t"
+		    : "=f" (ret)
+		    : "f" (*(__m64 *)&val), "f" (*(__m64 *)&imm)
+		);
+		return ret;
+	}
+	uint64_t val = ((uint64_t)__w3 << 48)	/* general case: same assembly as the constant path */
+		     | ((uint64_t)__w2 << 32)
+		     | ((uint64_t)__w1 << 16)
+		     | ((uint64_t)__w0 <<  0);
+	return *(__m64 *)&val;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set_pi32 (unsigned __i1, unsigned __i0)
+{	/* Builds an __m64 from two 32-bit values; __i1 lands in bits 32-63, __i0 in bits 0-31. */
+	if (__builtin_constant_p (__i1) &&
+	    __builtin_constant_p (__i0))
+	{	/* both arguments are compile-time constants: assemble the 64-bit pattern directly */
+		uint64_t val = ((uint64_t)__i1 << 32)
+			     | ((uint64_t)__i0 <<  0);
+		return *(__m64 *)&val;	/* NOTE(review): type pun through a pointer cast - confirm aliasing flags */
+	}
+	else if (__i1 == __i0)
+	{	/* both values are equal: hand __i1 to pshufh with selector (1, 0, 1, 0) */
+		uint64_t imm = _MM_SHUFFLE (1, 0, 1, 0);
+		__m64 ret;
+		asm("pshufh %0, %1, %2\n\t"
+		    : "=f" (ret)
+		    : "f" (*(__m32 *)&__i1), "f" (*(__m64 *)&imm)
+		);
+		return ret;
+	}
+	uint64_t val = ((uint64_t)__i1 << 32)	/* general case: same assembly as the constant path */
+		     | ((uint64_t)__i0 <<  0);
+	return *(__m64 *)&val;
+}
+#undef _MM_SHUFFLE /* the helper macro is not needed past this point */
+
 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_shuffle_pi16 (__m64 __m, int64_t __n)
 {
@@ -192,6 +286,17 @@ _mm_srli_pi16 (__m64 __m, int64_t __count)
 	return ret;
 }
 
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srli_pi32 (__m64 __m, int64_t __count)
+{	/* Runs the psrlw instruction on __m with __count as second operand and returns its result. */
+	__m64 ret;	/* output operand %0 */
+	asm("psrlw %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m), "f" (*(__m64 *)&__count)
+	);
+	return ret;
+}
+
 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_srli_si64 (__m64 __m, int64_t __count)
 {
@@ -203,6 +308,17 @@ _mm_srli_si64 (__m64 __m, int64_t __count)
 	return ret;
 }
 
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sub_pi16 (__m64 __m1, __m64 __m2)
+{	/* Runs the psubh instruction on __m1 and __m2 and returns its result. */
+	__m64 ret;	/* output operand %0 */
+	asm("psubh %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m1), "f" (__m2)
+	);
+	return ret;
+}
+
 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_unpackhi_pi8 (__m64 __m1, __m64 __m2)
 {
diff --git a/pixman/pixman/pixman-arm-neon-asm-bilinear.S b/pixman/pixman/pixman-arm-neon-asm-bilinear.S
index f7913adb7..e37b5c298 100644
--- a/pixman/pixman/pixman-arm-neon-asm-bilinear.S
+++ b/pixman/pixman/pixman-arm-neon-asm-bilinear.S
@@ -64,6 +64,7 @@
 .altmacro
 .p2align 2
 
+#include "pixman-private.h"
 #include "pixman-arm-neon-asm.h"
 
 /*
@@ -488,12 +489,12 @@ fname:
     vmull.u8  q1, d0, d28
     vmlal.u8  q1, d1, d29
     /* 5 cycles bubble */
-    vshll.u16 q0, d2, #8
+    vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
     vmlsl.u16 q0, d2, d30
     vmlal.u16 q0, d3, d30
     /* 5 cycles bubble */
     bilinear_duplicate_mask mask_fmt, 1, d4
-    vshrn.u32 d0, q0, #16
+    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
     /* 3 cycles bubble */
     vmovn.u16 d0, q0
     /* 1 cycle bubble */
@@ -514,16 +515,16 @@ fname:
                 q1, q11, d0, d1, d20, d21, d22, d23
     bilinear_load_mask mask_fmt, 2, d4
     bilinear_load_dst dst_fmt, op, 2, d18, d19, q9
-    vshll.u16 q0, d2, #8
+    vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
     vmlsl.u16 q0, d2, d30
     vmlal.u16 q0, d3, d30
-    vshll.u16 q10, d22, #8
+    vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS
     vmlsl.u16 q10, d22, d31
     vmlal.u16 q10, d23, d31
-    vshrn.u32 d0, q0, #16
-    vshrn.u32 d1, q10, #16
+    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
+    vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
     bilinear_duplicate_mask mask_fmt, 2, d4
-    vshr.u16  q15, q12, #8
+    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
     vadd.u16  q12, q12, q13
     vmovn.u16 d0, q0
     bilinear_interleave_src_dst \
@@ -544,29 +545,29 @@ fname:
                 q3, q9,  d4, d5, d16, d17, d18, d19
     pld       [TMP1, PF_OFFS]
     sub       TMP1, TMP1, STRIDE
-    vshll.u16 q0, d2, #8
+    vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
     vmlsl.u16 q0, d2, d30
     vmlal.u16 q0, d3, d30
-    vshll.u16 q10, d22, #8
+    vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS
     vmlsl.u16 q10, d22, d31
     vmlal.u16 q10, d23, d31
-    vshr.u16  q15, q12, #8
-    vshll.u16 q2, d6, #8
+    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
+    vshll.u16 q2, d6, #BILINEAR_INTERPOLATION_BITS
     vmlsl.u16 q2, d6, d30
     vmlal.u16 q2, d7, d30
-    vshll.u16 q8, d18, #8
+    vshll.u16 q8, d18, #BILINEAR_INTERPOLATION_BITS
     bilinear_load_mask mask_fmt, 4, d22
     bilinear_load_dst dst_fmt, op, 4, d2, d3, q1
     pld       [TMP1, PF_OFFS]
     vmlsl.u16 q8, d18, d31
     vmlal.u16 q8, d19, d31
     vadd.u16  q12, q12, q13
-    vshrn.u32 d0, q0, #16
-    vshrn.u32 d1, q10, #16
-    vshrn.u32 d4, q2, #16
-    vshrn.u32 d5, q8, #16
+    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
+    vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
+    vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
+    vshrn.u32 d5, q8, #(2 * BILINEAR_INTERPOLATION_BITS)
     bilinear_duplicate_mask mask_fmt, 4, d22
-    vshr.u16  q15, q12, #8
+    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
     vmovn.u16 d0, q0
     vmovn.u16 d1, q2
     vadd.u16  q12, q12, q13
@@ -694,13 +695,13 @@ pixman_asm_function fname
     blt       0f
     tst       OUT, #(1 << dst_bpp_shift)
     beq       0f
-    vshr.u16  q15, q12, #8
+    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
     vadd.u16  q12, q12, q13
     bilinear_process_last_pixel
     sub       WIDTH, WIDTH, #1
 0:
     vadd.u16  q13, q13, q13
-    vshr.u16  q15, q12, #8
+    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
     vadd.u16  q12, q12, q13
 
     cmp       WIDTH, #2
@@ -921,7 +922,7 @@ pixman_asm_function fname
     vmull.u8    q10, d22, d28
     vmlal.u8    q10, d23, d29
 
-    vshll.u16   q0, d16, #8
+    vshll.u16   q0, d16, #BILINEAR_INTERPOLATION_BITS
     vmlsl.u16   q0, d16, d30
     vmlal.u16   q0, d17, d30
 
@@ -932,27 +933,27 @@ pixman_asm_function fname
     vmull.u8    q11, d16, d28
     vmlal.u8    q11, d17, d29
 
-    vshll.u16   q1, d18, #8
+    vshll.u16   q1, d18, #BILINEAR_INTERPOLATION_BITS
     vmlsl.u16   q1, d18, d31
     vmlal.u16   q1, d19, d31
-    vshr.u16    q15, q12, #8
+    vshr.u16    q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
     vadd.u16    q12, q12, q13
 .endm
 
 .macro bilinear_over_8888_8888_process_pixblock_tail
-    vshll.u16   q2, d20, #8
+    vshll.u16   q2, d20, #BILINEAR_INTERPOLATION_BITS
     vmlsl.u16   q2, d20, d30
     vmlal.u16   q2, d21, d30
-    vshll.u16   q3, d22, #8
+    vshll.u16   q3, d22, #BILINEAR_INTERPOLATION_BITS
     vmlsl.u16   q3, d22, d31
     vmlal.u16   q3, d23, d31
-    vshrn.u32   d0, q0, #16
-    vshrn.u32   d1, q1, #16
+    vshrn.u32   d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
+    vshrn.u32   d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
     vld1.32     {d2, d3}, [OUT, :128]
     pld         [OUT, #(prefetch_offset * 4)]
-    vshrn.u32   d4, q2, #16
-    vshr.u16    q15, q12, #8
-    vshrn.u32   d5, q3, #16
+    vshrn.u32   d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
+    vshr.u16    q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
+    vshrn.u32   d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
     vmovn.u16   d6, q0
     vmovn.u16   d7, q2
     vuzp.8      d6, d7
@@ -975,7 +976,7 @@ pixman_asm_function fname
 .endm
 
 .macro bilinear_over_8888_8888_process_pixblock_tail_head
-                                            vshll.u16   q2, d20, #8
+                                            vshll.u16   q2, d20, #BILINEAR_INTERPOLATION_BITS
     mov         TMP1, X, asr #16
     add         X, X, UX
     add         TMP1, TOP, TMP1, asl #2
@@ -984,21 +985,21 @@ pixman_asm_function fname
     add         X, X, UX
     add         TMP2, TOP, TMP2, asl #2
                                             vmlal.u16   q2, d21, d30
-                                            vshll.u16   q3, d22, #8
+                                            vshll.u16   q3, d22, #BILINEAR_INTERPOLATION_BITS
     vld1.32     {d20}, [TMP1], STRIDE
                                             vmlsl.u16   q3, d22, d31
                                             vmlal.u16   q3, d23, d31
     vld1.32     {d21}, [TMP1]
     vmull.u8    q8, d20, d28
     vmlal.u8    q8, d21, d29
-                                            vshrn.u32   d0, q0, #16
-                                            vshrn.u32   d1, q1, #16
+                                            vshrn.u32   d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
+                                            vshrn.u32   d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
                                             vld1.32     {d2, d3}, [OUT, :128]
                                             pld         [OUT, PF_OFFS]
-                                            vshrn.u32   d4, q2, #16
-                                            vshr.u16    q15, q12, #8
+                                            vshrn.u32   d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
+                                            vshr.u16    q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
     vld1.32     {d22}, [TMP2], STRIDE
-                                            vshrn.u32   d5, q3, #16
+                                            vshrn.u32   d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
                                             vmovn.u16   d6, q0
     vld1.32     {d23}, [TMP2]
     vmull.u8    q9, d22, d28
@@ -1022,7 +1023,7 @@ pixman_asm_function fname
     vmlal.u8    q10, d23, d29
                                             vmull.u8    q11, d2, d4
                                             vmull.u8    q2, d3, d4
-    vshll.u16   q0, d16, #8
+    vshll.u16   q0, d16, #BILINEAR_INTERPOLATION_BITS
     vmlsl.u16   q0, d16, d30
                                             vrshr.u16   q1, q11, #8
     vmlal.u16   q0, d17, d30
@@ -1037,12 +1038,12 @@ pixman_asm_function fname
     vmull.u8    q11, d16, d28
     vmlal.u8    q11, d17, d29
                                             vuzp.8      d6, d7
-    vshll.u16   q1, d18, #8
+    vshll.u16   q1, d18, #BILINEAR_INTERPOLATION_BITS
                                             vuzp.8      d6, d7
     vmlsl.u16   q1, d18, d31
                                             vadd.u16    q12, q12, q13
     vmlal.u16   q1, d19, d31
-    vshr.u16    q15, q12, #8
+    vshr.u16    q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
     vadd.u16    q12, q12, q13
                                             vst1.32     {d6, d7}, [OUT, :128]!
 .endm
@@ -1081,14 +1082,14 @@ pixman_asm_function fname
     vmull.u8    q3, d2, d28
     vmlal.u8    q2, d1, d29
     vmlal.u8    q3, d3, d29
-    vshll.u16   q0, d4, #8
-    vshll.u16   q1, d6, #8
+    vshll.u16   q0, d4, #BILINEAR_INTERPOLATION_BITS
+    vshll.u16   q1, d6, #BILINEAR_INTERPOLATION_BITS
     vmlsl.u16   q0, d4, d30
     vmlsl.u16   q1, d6, d31
     vmlal.u16   q0, d5, d30
     vmlal.u16   q1, d7, d31
-    vshrn.u32   d0, q0, #16
-    vshrn.u32   d1, q1, #16
+    vshrn.u32   d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
+    vshrn.u32   d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
     vld1.32     {d2}, [TMP3], STRIDE
     vld1.32     {d3}, [TMP3]
     pld         [TMP4, PF_OFFS]
@@ -1099,7 +1100,7 @@ pixman_asm_function fname
     vmlal.u8    q3, d3, d29
     vmull.u8    q1, d4, d28
     vmlal.u8    q1, d5, d29
-    vshr.u16    q15, q12, #8
+    vshr.u16    q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
     vld1.32     {d22[0]}, [MASK]!
     pld         [MASK, #prefetch_offset]
     vadd.u16    q12, q12, q13
@@ -1107,17 +1108,17 @@ pixman_asm_function fname
 .endm
 
 .macro bilinear_over_8888_8_8888_process_pixblock_tail
-    vshll.u16   q9, d6, #8
-    vshll.u16   q10, d2, #8
+    vshll.u16   q9, d6, #BILINEAR_INTERPOLATION_BITS
+    vshll.u16   q10, d2, #BILINEAR_INTERPOLATION_BITS
     vmlsl.u16   q9, d6, d30
     vmlsl.u16   q10, d2, d31
     vmlal.u16   q9, d7, d30
     vmlal.u16   q10, d3, d31
-    vshr.u16    q15, q12, #8
+    vshr.u16    q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
     vadd.u16    q12, q12, q13
     vdup.32     d22, d22[0]
-    vshrn.u32   d18, q9, #16
-    vshrn.u32   d19, q10, #16
+    vshrn.u32   d18, q9, #(2 * BILINEAR_INTERPOLATION_BITS)
+    vshrn.u32   d19, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
     vmovn.u16   d17, q9
     vld1.32     {d18, d19}, [OUT, :128]
     pld         [OUT, PF_OFFS]
@@ -1146,11 +1147,11 @@ pixman_asm_function fname
 .endm
 
 .macro bilinear_over_8888_8_8888_process_pixblock_tail_head
-                                            vshll.u16   q9, d6, #8
+                                            vshll.u16   q9, d6, #BILINEAR_INTERPOLATION_BITS
     mov         TMP1, X, asr #16
     add         X, X, UX
     add         TMP1, TOP, TMP1, asl #2
-                                            vshll.u16   q10, d2, #8
+                                            vshll.u16   q10, d2, #BILINEAR_INTERPOLATION_BITS
     vld1.32     {d0}, [TMP1], STRIDE
     mov         TMP2, X, asr #16
     add         X, X, UX
@@ -1167,12 +1168,12 @@ pixman_asm_function fname
     mov         TMP4, X, asr #16
     add         X, X, UX
     add         TMP4, TOP, TMP4, asl #2
-                                            vshr.u16    q15, q12, #8
+                                            vshr.u16    q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
                                             vadd.u16    q12, q12, q13
     vld1.32     {d3}, [TMP2]
                                             vdup.32     d22, d22[0]
-                                            vshrn.u32   d18, q9, #16
-                                            vshrn.u32   d19, q10, #16
+                                            vshrn.u32   d18, q9, #(2 * BILINEAR_INTERPOLATION_BITS)
+                                            vshrn.u32   d19, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
     vmull.u8    q2, d0, d28
     vmull.u8    q3, d2, d28
                                             vmovn.u16   d17, q9
@@ -1182,8 +1183,8 @@ pixman_asm_function fname
     vmlal.u8    q3, d3, d29
                                             vuzp.8      d16, d17
                                             vuzp.8      d18, d19
-    vshll.u16   q0, d4, #8
-    vshll.u16   q1, d6, #8
+    vshll.u16   q0, d4, #BILINEAR_INTERPOLATION_BITS
+    vshll.u16   q1, d6, #BILINEAR_INTERPOLATION_BITS
                                             vuzp.8      d16, d17
                                             vuzp.8      d18, d19
     vmlsl.u16   q0, d4, d30
@@ -1194,8 +1195,8 @@ pixman_asm_function fname
     vmlal.u16   q1, d7, d31
                                             vrsra.u16   q10, q10, #8
                                             vrsra.u16   q11, q11, #8
-    vshrn.u32   d0, q0, #16
-    vshrn.u32   d1, q1, #16
+    vshrn.u32   d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
+    vshrn.u32   d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
                                             vrshrn.u16  d16, q10, #8
                                             vrshrn.u16  d17, q11, #8
     vld1.32     {d2}, [TMP3], STRIDE
@@ -1216,7 +1217,7 @@ pixman_asm_function fname
                                             vraddhn.u16 d18, q9, q10
                                             vraddhn.u16 d19, q15, q11
     vmlal.u8    q1, d5, d29
-    vshr.u16    q15, q12, #8
+    vshr.u16    q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
                                             vqadd.u8    q9, q8, q9
     vld1.32     {d22[0]}, [MASK]!
                                             vuzp.8      d18, d19
diff --git a/pixman/pixman/pixman-arm-neon-asm.S b/pixman/pixman/pixman-arm-neon-asm.S
index 87aae1d55..187197dc3 100644
--- a/pixman/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman/pixman-arm-neon-asm.S
@@ -49,6 +49,7 @@
     .altmacro
     .p2align 2
 
+#include "pixman-private.h"
 #include "pixman-arm-neon-asm.h"
 
 /* Global configuration options and preferences */
@@ -2986,11 +2987,11 @@ fname:
     vmull.u8  q1, d0, d28
     vmlal.u8  q1, d1, d29
     /* 5 cycles bubble */
-    vshll.u16 q0, d2, #8
+    vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
     vmlsl.u16 q0, d2, d30
     vmlal.u16 q0, d3, d30
     /* 5 cycles bubble */
-    vshrn.u32 d0, q0, #16
+    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
     /* 3 cycles bubble */
     vmovn.u16 d0, q0
     /* 1 cycle bubble */
@@ -3000,15 +3001,15 @@ fname:
 .macro bilinear_interpolate_two_pixels src_fmt, dst_fmt
     bilinear_load_and_vertical_interpolate_two_&src_fmt \
                 q1, q11, d0, d1, d20, d21, d22, d23
-    vshll.u16 q0, d2, #8
+    vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
     vmlsl.u16 q0, d2, d30
     vmlal.u16 q0, d3, d30
-    vshll.u16 q10, d22, #8
+    vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS
     vmlsl.u16 q10, d22, d31
     vmlal.u16 q10, d23, d31
-    vshrn.u32 d0, q0, #16
-    vshrn.u32 d1, q10, #16
-    vshr.u16  q15, q12, #8
+    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
+    vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
+    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
     vadd.u16  q12, q12, q13
     vmovn.u16 d0, q0
     bilinear_store_&dst_fmt 2, q2, q3
@@ -3020,26 +3021,26 @@ fname:
                 q3, q9,  d4, d5, d16, d17, d18, d19
     pld       [TMP1, PF_OFFS]
     sub       TMP1, TMP1, STRIDE
-    vshll.u16 q0, d2, #8
+    vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
     vmlsl.u16 q0, d2, d30
     vmlal.u16 q0, d3, d30
-    vshll.u16 q10, d22, #8
+    vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS
     vmlsl.u16 q10, d22, d31
     vmlal.u16 q10, d23, d31
-    vshr.u16  q15, q12, #8
-    vshll.u16 q2, d6, #8
+    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
+    vshll.u16 q2, d6, #BILINEAR_INTERPOLATION_BITS
     vmlsl.u16 q2, d6, d30
     vmlal.u16 q2, d7, d30
-    vshll.u16 q8, d18, #8
+    vshll.u16 q8, d18, #BILINEAR_INTERPOLATION_BITS
     pld       [TMP2, PF_OFFS]
     vmlsl.u16 q8, d18, d31
     vmlal.u16 q8, d19, d31
     vadd.u16  q12, q12, q13
-    vshrn.u32 d0, q0, #16
-    vshrn.u32 d1, q10, #16
-    vshrn.u32 d4, q2, #16
-    vshrn.u32 d5, q8, #16
-    vshr.u16  q15, q12, #8
+    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
+    vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
+    vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
+    vshrn.u32 d5, q8, #(2 * BILINEAR_INTERPOLATION_BITS)
+    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
     vmovn.u16 d0, q0
     vmovn.u16 d1, q2
     vadd.u16  q12, q12, q13
@@ -3158,13 +3159,13 @@ pixman_asm_function fname
     blt       0f
     tst       OUT, #(1 << dst_bpp_shift)
     beq       0f
-    vshr.u16  q15, q12, #8
+    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
     vadd.u16  q12, q12, q13
     bilinear_interpolate_last_pixel src_fmt, dst_fmt
     sub       WIDTH, WIDTH, #1
 0:
     vadd.u16  q13, q13, q13
-    vshr.u16  q15, q12, #8
+    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
     vadd.u16  q12, q12, q13
 
     cmp       WIDTH, #2
@@ -3282,7 +3283,7 @@ pixman_asm_function fname
     vmull.u8  q10, d22, d28
     vmlal.u8  q10, d23, d29
 
-    vshll.u16 q0, d16, #8
+    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
     vmlsl.u16 q0, d16, d30
     vmlal.u16 q0, d17, d30
 
@@ -3293,25 +3294,25 @@ pixman_asm_function fname
     vmull.u8  q11, d16, d28
     vmlal.u8  q11, d17, d29
 
-    vshll.u16 q1, d18, #8
+    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
     vmlsl.u16 q1, d18, d31
 .endm
 
 .macro bilinear_interpolate_four_pixels_8888_8888_tail
     vmlal.u16 q1, d19, d31
-    vshr.u16  q15, q12, #8
-    vshll.u16 q2, d20, #8
+    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
+    vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
     vmlsl.u16 q2, d20, d30
     vmlal.u16 q2, d21, d30
-    vshll.u16 q3, d22, #8
+    vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
     vmlsl.u16 q3, d22, d31
     vmlal.u16 q3, d23, d31
     vadd.u16  q12, q12, q13
-    vshrn.u32 d0, q0, #16
-    vshrn.u32 d1, q1, #16
-    vshrn.u32 d4, q2, #16
-    vshr.u16  q15, q12, #8
-    vshrn.u32 d5, q3, #16
+    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
+    vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
+    vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
+    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
+    vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
     vmovn.u16 d6, q0
     vmovn.u16 d7, q2
     vadd.u16  q12, q12, q13
@@ -3326,22 +3327,22 @@ pixman_asm_function fname
     add       X, X, UX
     add       TMP2, TOP, TMP2, asl #2
         vmlal.u16 q1, d19, d31
-        vshr.u16  q15, q12, #8
-        vshll.u16 q2, d20, #8
+        vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
+        vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
         vmlsl.u16 q2, d20, d30
         vmlal.u16 q2, d21, d30
-        vshll.u16 q3, d22, #8
+        vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
     vld1.32   {d20}, [TMP1], STRIDE
         vmlsl.u16 q3, d22, d31
         vmlal.u16 q3, d23, d31
     vld1.32   {d21}, [TMP1]
     vmull.u8  q8, d20, d28
     vmlal.u8  q8, d21, d29
-        vshrn.u32 d0, q0, #16
-        vshrn.u32 d1, q1, #16
-        vshrn.u32 d4, q2, #16
+        vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
+        vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
+        vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
     vld1.32   {d22}, [TMP2], STRIDE
-        vshrn.u32 d5, q3, #16
+        vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
         vadd.u16  q12, q12, q13
     vld1.32   {d23}, [TMP2]
     vmull.u8  q9, d22, d28
@@ -3353,12 +3354,12 @@ pixman_asm_function fname
     add       TMP4, TOP, TMP4, asl #2
     vmlal.u8  q9, d23, d29
     vld1.32   {d22}, [TMP3], STRIDE
-        vshr.u16  q15, q12, #8
+        vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
     vld1.32   {d23}, [TMP3]
     vmull.u8  q10, d22, d28
     vmlal.u8  q10, d23, d29
         vmovn.u16 d6, q0
-    vshll.u16 q0, d16, #8
+    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
         vmovn.u16 d7, q2
     vmlsl.u16 q0, d16, d30
     vmlal.u16 q0, d17, d30
@@ -3370,7 +3371,7 @@ pixman_asm_function fname
     vmull.u8  q11, d16, d28
     vmlal.u8  q11, d17, d29
         vst1.32   {d6, d7}, [OUT, :128]!
-    vshll.u16 q1, d18, #8
+    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
     vmlsl.u16 q1, d18, d31
 .endm
 
@@ -3403,7 +3404,7 @@ pixman_asm_function fname
     vld1.32   {d23}, [TMP3]
     vmull.u8  q10, d22, d28
     vmlal.u8  q10, d23, d29
-    vshll.u16 q0, d16, #8
+    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
     vmlsl.u16 q0, d16, d30
     vmlal.u16 q0, d17, d30
     pld       [TMP4, PF_OFFS]
@@ -3412,7 +3413,7 @@ pixman_asm_function fname
     pld       [TMP4, PF_OFFS]
     vmull.u8  q11, d16, d28
     vmlal.u8  q11, d17, d29
-    vshll.u16 q1, d18, #8
+    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
     vmlsl.u16 q1, d18, d31
 
     mov       TMP1, X, asr #16
@@ -3422,22 +3423,22 @@ pixman_asm_function fname
     add       X, X, UX
     add       TMP2, TOP, TMP2, asl #2
         vmlal.u16 q1, d19, d31
-        vshr.u16  q15, q12, #8
-        vshll.u16 q2, d20, #8
+        vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
+        vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
         vmlsl.u16 q2, d20, d30
         vmlal.u16 q2, d21, d30
-        vshll.u16 q3, d22, #8
+        vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
     vld1.32   {d20}, [TMP1], STRIDE
         vmlsl.u16 q3, d22, d31
         vmlal.u16 q3, d23, d31
     vld1.32   {d21}, [TMP1]
     vmull.u8  q8, d20, d28
     vmlal.u8  q8, d21, d29
-        vshrn.u32 d0, q0, #16
-        vshrn.u32 d1, q1, #16
-        vshrn.u32 d4, q2, #16
+        vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
+        vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
+        vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
     vld1.32   {d22}, [TMP2], STRIDE
-        vshrn.u32 d5, q3, #16
+        vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
         vadd.u16  q12, q12, q13
     vld1.32   {d23}, [TMP2]
     vmull.u8  q9, d22, d28
@@ -3449,12 +3450,12 @@ pixman_asm_function fname
     add       TMP4, TOP, TMP4, asl #2
     vmlal.u8  q9, d23, d29
     vld1.32   {d22}, [TMP3], STRIDE
-        vshr.u16  q15, q12, #8
+        vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
     vld1.32   {d23}, [TMP3]
     vmull.u8  q10, d22, d28
     vmlal.u8  q10, d23, d29
         vmovn.u16 d8, q0
-    vshll.u16 q0, d16, #8
+    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
         vmovn.u16 d9, q2
     vmlsl.u16 q0, d16, d30
     vmlal.u16 q0, d17, d30
@@ -3465,25 +3466,25 @@ pixman_asm_function fname
     pld       [TMP4, PF_OFFS]
     vmull.u8  q11, d16, d28
     vmlal.u8  q11, d17, d29
-    vshll.u16 q1, d18, #8
+    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
     vmlsl.u16 q1, d18, d31
 .endm
 
 .macro bilinear_interpolate_eight_pixels_8888_0565_tail
     vmlal.u16 q1, d19, d31
-    vshr.u16  q15, q12, #8
-    vshll.u16 q2, d20, #8
+    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
+    vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
     vmlsl.u16 q2, d20, d30
     vmlal.u16 q2, d21, d30
-    vshll.u16 q3, d22, #8
+    vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
     vmlsl.u16 q3, d22, d31
     vmlal.u16 q3, d23, d31
     vadd.u16  q12, q12, q13
-    vshrn.u32 d0, q0, #16
-    vshrn.u32 d1, q1, #16
-    vshrn.u32 d4, q2, #16
-    vshr.u16  q15, q12, #8
-    vshrn.u32 d5, q3, #16
+    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
+    vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
+    vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
+    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
+    vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
     vmovn.u16 d10, q0
     vmovn.u16 d11, q2
     vadd.u16  q12, q12, q13
@@ -3508,23 +3509,23 @@ pixman_asm_function fname
     add       X, X, UX
     add       TMP2, TOP, TMP2, asl #2
         vmlal.u16 q1, d19, d31
-        vshr.u16  q15, q12, #8
+        vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
             vuzp.u8 d8, d9
-        vshll.u16 q2, d20, #8
+        vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
         vmlsl.u16 q2, d20, d30
         vmlal.u16 q2, d21, d30
-        vshll.u16 q3, d22, #8
+        vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
     vld1.32   {d20}, [TMP1], STRIDE
         vmlsl.u16 q3, d22, d31
         vmlal.u16 q3, d23, d31
     vld1.32   {d21}, [TMP1]
     vmull.u8  q8, d20, d28
     vmlal.u8  q8, d21, d29
-        vshrn.u32 d0, q0, #16
-        vshrn.u32 d1, q1, #16
-        vshrn.u32 d4, q2, #16
+        vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
+        vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
+        vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
     vld1.32   {d22}, [TMP2], STRIDE
-        vshrn.u32 d5, q3, #16
+        vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
         vadd.u16  q12, q12, q13
     vld1.32   {d23}, [TMP2]
     vmull.u8  q9, d22, d28
@@ -3536,12 +3537,12 @@ pixman_asm_function fname
     add       TMP4, TOP, TMP4, asl #2
     vmlal.u8  q9, d23, d29
     vld1.32   {d22}, [TMP3], STRIDE
-        vshr.u16  q15, q12, #8
+        vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
     vld1.32   {d23}, [TMP3]
     vmull.u8  q10, d22, d28
     vmlal.u8  q10, d23, d29
         vmovn.u16 d10, q0
-    vshll.u16 q0, d16, #8
+    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
         vmovn.u16 d11, q2
     vmlsl.u16 q0, d16, d30
     vmlal.u16 q0, d17, d30
@@ -3553,7 +3554,7 @@ pixman_asm_function fname
     vmull.u8  q11, d16, d28
     vmlal.u8  q11, d17, d29
             vuzp.u8 d10, d11
-    vshll.u16 q1, d18, #8
+    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
     vmlsl.u16 q1, d18, d31
 
     mov       TMP1, X, asr #16
@@ -3564,12 +3565,12 @@ pixman_asm_function fname
     add       TMP2, TOP, TMP2, asl #2
         vmlal.u16 q1, d19, d31
             vuzp.u8 d9, d11
-        vshr.u16  q15, q12, #8
-        vshll.u16 q2, d20, #8
+        vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
+        vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
             vuzp.u8 d8, d10
         vmlsl.u16 q2, d20, d30
         vmlal.u16 q2, d21, d30
-        vshll.u16 q3, d22, #8
+        vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
     vld1.32   {d20}, [TMP1], STRIDE
         vmlsl.u16 q3, d22, d31
         vmlal.u16 q3, d23, d31
@@ -3579,13 +3580,13 @@ pixman_asm_function fname
             vshll.u8  q6, d9, #8
             vshll.u8  q5, d10, #8
             vshll.u8  q7, d8, #8
-        vshrn.u32 d0, q0, #16
+        vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
             vsri.u16  q5, q6, #5
-        vshrn.u32 d1, q1, #16
+        vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
             vsri.u16  q5, q7, #11
-        vshrn.u32 d4, q2, #16
+        vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
     vld1.32   {d22}, [TMP2], STRIDE
-        vshrn.u32 d5, q3, #16
+        vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
         vadd.u16  q12, q12, q13
     vld1.32   {d23}, [TMP2]
     vmull.u8  q9, d22, d28
@@ -3597,12 +3598,12 @@ pixman_asm_function fname
     add       TMP4, TOP, TMP4, asl #2
     vmlal.u8  q9, d23, d29
     vld1.32   {d22}, [TMP3], STRIDE
-        vshr.u16  q15, q12, #8
+        vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
     vld1.32   {d23}, [TMP3]
     vmull.u8  q10, d22, d28
     vmlal.u8  q10, d23, d29
         vmovn.u16 d8, q0
-    vshll.u16 q0, d16, #8
+    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
         vmovn.u16 d9, q2
     vmlsl.u16 q0, d16, d30
     vmlal.u16 q0, d17, d30
@@ -3613,7 +3614,7 @@ pixman_asm_function fname
     pld       [TMP4, PF_OFFS]
     vmull.u8  q11, d16, d28
     vmlal.u8  q11, d17, d29
-    vshll.u16 q1, d18, #8
+    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
             vst1.32   {d10, d11}, [OUT, :128]!
     vmlsl.u16 q1, d18, d31
 .endm
diff --git a/pixman/pixman/pixman-bits-image.c b/pixman/pixman/pixman-bits-image.c
index 05eab9634..b6c8630f4 100644
--- a/pixman/pixman/pixman-bits-image.c
+++ b/pixman/pixman/pixman-bits-image.c
@@ -131,8 +131,8 @@ bits_image_fetch_pixel_bilinear (bits_image_t   *image,
     x1 = x - pixman_fixed_1 / 2;
     y1 = y - pixman_fixed_1 / 2;
 
-    distx = (x1 >> 8) & 0xff;
-    disty = (y1 >> 8) & 0xff;
+    distx = pixman_fixed_to_bilinear_weight (x1);
+    disty = pixman_fixed_to_bilinear_weight (y1);
 
     x1 = pixman_fixed_to_int (x1);
     y1 = pixman_fixed_to_int (y1);
@@ -200,7 +200,7 @@ bits_image_fetch_bilinear_no_repeat_8888 (pixman_iter_t *iter,
     x = x_top = x_bottom = v.vector[0] - pixman_fixed_1/2;
 
     y = v.vector[1] - pixman_fixed_1/2;
-    disty = (y >> 8) & 0xff;
+    disty = pixman_fixed_to_bilinear_weight (y);
 
     /* Load the pointers to the first and second lines from the source
      * image that bilinear code must read.
@@ -309,7 +309,7 @@ bits_image_fetch_bilinear_no_repeat_8888 (pixman_iter_t *iter,
 	tr = top_row[pixman_fixed_to_int (x_top) + 1] | top_mask;
 	br = bottom_row[pixman_fixed_to_int (x_bottom) + 1] | bottom_mask;
 
-	distx = (x >> 8) & 0xff;
+	distx = pixman_fixed_to_bilinear_weight (x);
 
 	*buffer++ = bilinear_interpolation (0, tr, 0, br, distx, disty);
 
@@ -334,7 +334,7 @@ bits_image_fetch_bilinear_no_repeat_8888 (pixman_iter_t *iter,
 	    bl = bottom_row [pixman_fixed_to_int (x_bottom)] | bottom_mask;
 	    br = bottom_row [pixman_fixed_to_int (x_bottom) + 1] | bottom_mask;
 
-	    distx = (x >> 8) & 0xff;
+	    distx = pixman_fixed_to_bilinear_weight (x);
 
 	    *buffer = bilinear_interpolation (tl, tr, bl, br, distx, disty);
 	}
@@ -358,7 +358,7 @@ bits_image_fetch_bilinear_no_repeat_8888 (pixman_iter_t *iter,
 	    tl = top_row [pixman_fixed_to_int (x_top)] | top_mask;
 	    bl = bottom_row [pixman_fixed_to_int (x_bottom)] | bottom_mask;
 
-	    distx = (x >> 8) & 0xff;
+	    distx = pixman_fixed_to_bilinear_weight (x);
 
 	    *buffer = bilinear_interpolation (tl, 0, bl, 0, distx, disty);
 	}
@@ -695,8 +695,8 @@ bits_image_fetch_bilinear_affine (pixman_image_t * image,
 	x1 = x - pixman_fixed_1 / 2;
 	y1 = y - pixman_fixed_1 / 2;
 
-	distx = (x1 >> 8) & 0xff;
-	disty = (y1 >> 8) & 0xff;
+	distx = pixman_fixed_to_bilinear_weight (x1);
+	disty = pixman_fixed_to_bilinear_weight (y1);
 
 	y1 = pixman_fixed_to_int (y1);
 	y2 = y1 + 1;
diff --git a/pixman/pixman/pixman-inlines.h b/pixman/pixman/pixman-inlines.h
index 3532867a4..5517de5a5 100644
--- a/pixman/pixman/pixman-inlines.h
+++ b/pixman/pixman/pixman-inlines.h
@@ -81,6 +81,13 @@ repeat (pixman_repeat_t repeat, int *c, int size)
     return TRUE;
 }
 
+static force_inline int /* result lies in [0, (1 << BILINEAR_INTERPOLATION_BITS) - 1] */
+pixman_fixed_to_bilinear_weight (pixman_fixed_t x)
+{
+    return (x >> (16 - BILINEAR_INTERPOLATION_BITS)) & /* discard the lowest (16 - BITS) bits of x ... */
+	   ((1 << BILINEAR_INTERPOLATION_BITS) - 1); /* ... and keep only the next BITS bits */
+}
+
 #if SIZEOF_LONG > 4
 
 static force_inline uint32_t
@@ -92,6 +99,9 @@ bilinear_interpolation (uint32_t tl, uint32_t tr,
     uint64_t tl64, tr64, bl64, br64;
     uint64_t f, r;
 
+    distx <<= (8 - BILINEAR_INTERPOLATION_BITS);
+    disty <<= (8 - BILINEAR_INTERPOLATION_BITS);
+
     distxy = distx * disty;
     distxiy = distx * (256 - disty);
     distixy = (256 - distx) * disty;
@@ -135,6 +145,9 @@ bilinear_interpolation (uint32_t tl, uint32_t tr,
     int distxy, distxiy, distixy, distixiy;
     uint32_t f, r;
 
+    distx <<= (8 - BILINEAR_INTERPOLATION_BITS);
+    disty <<= (8 - BILINEAR_INTERPOLATION_BITS);
+
     distxy = distx * disty;
     distxiy = (distx << 8) - distxy;	/* distx * (256 - disty) */
     distixy = (disty << 8) - distxy;	/* disty * (256 - distx) */
@@ -758,12 +771,14 @@ bilinear_pad_repeat_get_scanline_bounds (int32_t         source_image_width,
  *                        all source pixels are fetched from zero padding
  *                        zone for NONE repeat
  *
- * Note: normally the sum of 'weight_top' and 'weight_bottom' is equal to 256,
- *       but sometimes it may be less than that for NONE repeat when handling
- *       fuzzy antialiased top or bottom image edges. Also both top and
- *       bottom weight variables are guaranteed to have value in 0-255
- *       range and can fit into unsigned byte or be used with 8-bit SIMD
- *       multiplication instructions.
+ * Note: normally the sum of 'weight_top' and 'weight_bottom' is equal to
+ *       BILINEAR_INTERPOLATION_RANGE, but sometimes it may be less than that
+ *       for NONE repeat when handling fuzzy antialiased top or bottom image
+ *       edges. Also, both the top and bottom weight variables are guaranteed
+ *       to have a value that is less than BILINEAR_INTERPOLATION_RANGE.
+ *       For example, with 8-bit interpolation precision the weights fit
+ *       into an unsigned byte and can be used with 8-bit SIMD
+ *       multiplication instructions.
  */
 #define FAST_BILINEAR_MAINLOOP_INT(scale_func_name, scanline_func, src_type_t, mask_type_t,	\
 				  dst_type_t, repeat_mode, flags)				\
@@ -877,18 +892,18 @@ fast_composite_scaled_bilinear ## scale_func_name (pixman_implementation_t *imp,
 	}											\
 												\
 	y1 = pixman_fixed_to_int (vy);								\
-	weight2 = (vy >> 8) & 0xff;								\
+	weight2 = pixman_fixed_to_bilinear_weight (vy);						\
 	if (weight2)										\
 	{											\
-	    /* normal case, both row weights are in 0-255 range and fit unsigned byte */	\
+	    /* both weight1 and weight2 are smaller than BILINEAR_INTERPOLATION_RANGE */	\
 	    y2 = y1 + 1;									\
-	    weight1 = 256 - weight2;								\
+	    weight1 = BILINEAR_INTERPOLATION_RANGE - weight2;					\
 	}											\
 	else											\
 	{											\
-	    /* set both top and bottom row to the same scanline, and weights to 128+128 */	\
+	    /* set both top and bottom row to the same scanline and tweak weights */		\
 	    y2 = y1;										\
-	    weight1 = weight2 = 128;								\
+	    weight1 = weight2 = BILINEAR_INTERPOLATION_RANGE / 2;				\
 	}											\
 	vy += unit_y;										\
 	if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD)					\
diff --git a/pixman/pixman/pixman-mips-dspr2-asm.S b/pixman/pixman/pixman-mips-dspr2-asm.S
index 87558f032..48f108ed9 100644
--- a/pixman/pixman/pixman-mips-dspr2-asm.S
+++ b/pixman/pixman/pixman-mips-dspr2-asm.S
@@ -29,6 +29,7 @@
  * Author:  Nemanja Lukic (nlukic@mips.com)
  */
 
+#include "pixman-private.h"
 #include "pixman-mips-dspr2-asm.h"
 
 LEAF_MIPS_DSPR2(pixman_fill_buff16_mips)
@@ -771,11 +772,15 @@ LEAF_MIPS_DSPR2(pixman_scaled_bilinear_scanline_8888_8_8888_OVER_asm_mips)
     lw       s1, 48(sp)        /* s1 = wb */
     lw       s2, 52(sp)        /* s2 = vx */
     lw       s3, 56(sp)        /* s3 = unit_x */
-    li       v0, 256
+    li       v0, BILINEAR_INTERPOLATION_RANGE
     li       s8, 0x00ff00ff
+
+    sll      s0, s0, (2 * (8 - BILINEAR_INTERPOLATION_BITS)) /* s0 <<= 2 * (8 - BITS); NOTE(review): presumably keeps weight products on the 1 << 16 scale - confirm */
+    sll      s1, s1, (2 * (8 - BILINEAR_INTERPOLATION_BITS)) /* s1 (wb) <<= 2 * (8 - BITS), same pre-scaling */
+
 0:
     andi     t4, s2, 0xffff    /* t4 = (short)vx */
-    srl      t4, t4, 8         /* t4 = vx >> 8 */
+    srl      t4, t4, (16 - BILINEAR_INTERPOLATION_BITS) /* t4 = vx >> (16 - BILINEAR_INTERPOLATION_BITS) */
     subu     t5, v0, t4        /* t5 = ( 256 - (vx>>8)) */
 
     mul      s4, s0, t5        /* s4 = wt*(256-(vx>>8)) */
diff --git a/pixman/pixman/pixman-mmx.c b/pixman/pixman/pixman-mmx.c
index d869c04c6..5441d6bc2 100644
--- a/pixman/pixman/pixman-mmx.c
+++ b/pixman/pixman/pixman-mmx.c
@@ -42,6 +42,7 @@
 #endif
 #include "pixman-private.h"
 #include "pixman-combine32.h"
+#include "pixman-inlines.h"
 
 #define no_vERBOSE
 
@@ -694,6 +695,24 @@ combine (const uint32_t *src, const uint32_t *mask)
     return vsrc;
 }
 
+static force_inline __m64 /* only the low 4 bytes of vsrc and vdst are used; see per-branch notes */
+core_combine_over_u_pixel_mmx (__m64 vsrc, __m64 vdst)
+{
+    vsrc = _mm_unpacklo_pi8 (vsrc, _mm_setzero_si64 ()); /* zero-extend the low 4 bytes into four 16-bit lanes */
+
+    if (is_opaque (vsrc))
+    {
+	return vsrc; /* opaque per is_opaque(): vdst is ignored, the widened source is the result */
+    }
+    else if (!is_zero (vsrc))
+    {
+	return over (vsrc, expand_alpha (vsrc), /* combine through over() with the source's own expanded alpha */
+		     _mm_unpacklo_pi8 (vdst, _mm_setzero_si64 ()));
+    }
+
+    return _mm_unpacklo_pi8 (vdst, _mm_setzero_si64 ()); /* is_zero (vsrc) held: return vdst, widened the same way */
+}
+
 static void
 mmx_combine_over_u (pixman_implementation_t *imp,
                     pixman_op_t              op,
@@ -1599,9 +1618,7 @@ mmx_composite_over_8888_n_8888 (pixman_implementation_t *imp,
     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
 
     mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);
-    mask &= 0xff000000;
-    mask = mask | mask >> 8 | mask >> 16 | mask >> 24;
-    vmask = load8888 (&mask);
+    vmask = expand_alpha (load8888 (&mask));
 
     while (height--)
     {
@@ -1670,9 +1687,7 @@ mmx_composite_over_x888_n_8888 (pixman_implementation_t *imp,
     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
     mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);
 
-    mask &= 0xff000000;
-    mask = mask | mask >> 8 | mask >> 16 | mask >> 24;
-    vmask = load8888 (&mask);
+    vmask = expand_alpha (load8888 (&mask));
     srca = MC (4x00ff);
 
     while (height--)
@@ -3506,6 +3521,242 @@ mmx_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
     _mm_empty ();
 }
 
+#define BSHIFT ((1 << BILINEAR_INTERPOLATION_BITS))	/* 2^BITS: the full interpolation weight */
+#define BMSK (BSHIFT - 1)	/* low BILINEAR_INTERPOLATION_BITS bits set */
+
+#define BILINEAR_DECLARE_VARIABLES  /* needs wt, wb, vx, unit_x in scope */	\
+    const __m64 mm_wt = _mm_set_pi16 (wt, wt, wt, wt);	/* weight for src_top */ \
+    const __m64 mm_wb = _mm_set_pi16 (wb, wb, wb, wb);	/* weight for src_bottom */ \
+    const __m64 mm_BSHIFT = _mm_set_pi16 (BSHIFT, BSHIFT, BSHIFT, BSHIFT);	\
+    const __m64 mm_addc7 = _mm_set_pi16 (0, 1, 0, 1);	/* +1 in lanes 0 and 2 */ \
+    const __m64 mm_xorc7 = _mm_set_pi16 (0, BMSK, 0, BMSK); /* xor mask, lanes 0 and 2 */ \
+    const __m64 mm_ux = _mm_set_pi16 (unit_x, unit_x, unit_x, unit_x);	/* x step */ \
+    const __m64 mm_zero = _mm_setzero_si64 ();	/* for byte -> 16-bit unpacking */ \
+    __m64 mm_x = _mm_set_pi16 (vx, vx, vx, vx)	/* running x, 16 bits per lane */
+
+#define BILINEAR_INTERPOLATE_ONE_PIXEL(pix) /* pix = next sample; steps vx, mm_x */ \
+do {										\
+    /* fetch 2x2 pixel block: t = top row pair, b = bottom row pair */		\
+    __m64 t = ldq_u ((__m64 *)&src_top [pixman_fixed_to_int (vx)]);		\
+    __m64 b = ldq_u ((__m64 *)&src_bottom [pixman_fixed_to_int (vx)]);		\
+    vx += unit_x;								\
+    /* vertical interpolation: top * wt + bottom * wb in 16-bit lanes */	\
+    __m64 t_hi = _mm_mullo_pi16 (_mm_unpackhi_pi8 (t, mm_zero), mm_wt);		\
+    __m64 t_lo = _mm_mullo_pi16 (_mm_unpacklo_pi8 (t, mm_zero), mm_wt);		\
+    __m64 b_hi = _mm_mullo_pi16 (_mm_unpackhi_pi8 (b, mm_zero), mm_wb);		\
+    __m64 b_lo = _mm_mullo_pi16 (_mm_unpacklo_pi8 (b, mm_zero), mm_wb);		\
+    __m64 hi = _mm_add_pi16 (t_hi, b_hi);					\
+    __m64 lo = _mm_add_pi16 (t_lo, b_lo);					\
+    if (BILINEAR_INTERPOLATION_BITS < 8)					\
+    {										\
+	/* horizontal weights: lanes alternate BSHIFT - f and f (f = top bits of mm_x) */ \
+	__m64 mm_wh = _mm_add_pi16 (mm_addc7, _mm_xor_si64 (mm_xorc7,		\
+			  _mm_srli_pi16 (mm_x,					\
+					 16 - BILINEAR_INTERPOLATION_BITS)));	\
+	mm_x = _mm_add_pi16 (mm_x, mm_ux);					\
+	/* horizontal interpolation: madd sums lo * (BSHIFT - f) + hi * f per channel */ \
+	__m64 p = _mm_unpacklo_pi16 (lo, hi);					\
+	__m64 q = _mm_unpackhi_pi16 (lo, hi);					\
+	lo = _mm_madd_pi16 (p, mm_wh);						\
+	hi = _mm_madd_pi16 (q, mm_wh);						\
+    }										\
+    else									\
+    {										\
+	/* horizontal weights: mm_wh_lo = BSHIFT - f (for lo), mm_wh_hi = f (for hi) */ \
+	__m64 mm_wh_lo = _mm_sub_pi16 (mm_BSHIFT, _mm_srli_pi16 (mm_x,		\
+					16 - BILINEAR_INTERPOLATION_BITS));	\
+	__m64 mm_wh_hi = _mm_srli_pi16 (mm_x,					\
+					16 - BILINEAR_INTERPOLATION_BITS);	\
+	mm_x = _mm_add_pi16 (mm_x, mm_ux);					\
+	/* horizontal interpolation: 32-bit products rebuilt from mullo/mulhi halves */ \
+	__m64 mm_lo_lo = _mm_mullo_pi16 (lo, mm_wh_lo);				\
+	__m64 mm_lo_hi = _mm_mullo_pi16 (hi, mm_wh_hi);				\
+	__m64 mm_hi_lo = _mm_mulhi_pu16 (lo, mm_wh_lo);				\
+	__m64 mm_hi_hi = _mm_mulhi_pu16 (hi, mm_wh_hi);				\
+	lo = _mm_add_pi32 (_mm_unpacklo_pi16 (mm_lo_lo, mm_hi_lo),		\
+			   _mm_unpacklo_pi16 (mm_lo_hi, mm_hi_hi));		\
+	hi = _mm_add_pi32 (_mm_unpackhi_pi16 (mm_lo_lo, mm_hi_lo),		\
+			   _mm_unpackhi_pi16 (mm_lo_hi, mm_hi_hi));		\
+    }										\
+    /* drop the 2 * BITS weight bits, then pack 32 -> 16 -> 8 bits per channel */ \
+    hi = _mm_srli_pi32 (hi, BILINEAR_INTERPOLATION_BITS * 2);			\
+    lo = _mm_srli_pi32 (lo, BILINEAR_INTERPOLATION_BITS * 2);			\
+    lo = _mm_packs_pi32 (lo, hi);						\
+    lo = _mm_packs_pu16 (lo, lo);						\
+    pix = lo;									\
+} while (0)
+
+#define BILINEAR_SKIP_ONE_PIXEL()  /* step vx and mm_x without sampling */	\
+do {										\
+    vx += unit_x;								\
+    mm_x = _mm_add_pi16 (mm_x, mm_ux);						\
+} while(0)
+
+/* Write w bilinearly interpolated 32-bit pixels to dst (plain store). */
+static force_inline void
+scaled_bilinear_scanline_mmx_8888_8888_SRC (uint32_t *       dst,
+					    const uint32_t * mask,
+					    const uint32_t * src_top,
+					    const uint32_t * src_bottom,
+					    int32_t          w,
+					    int              wt,
+					    int              wb,
+					    pixman_fixed_t   vx,
+					    pixman_fixed_t   unit_x,
+					    pixman_fixed_t   max_vx,
+					    pixman_bool_t    zero_src)
+{
+    BILINEAR_DECLARE_VARIABLES;
+    __m64 result;
+
+    /* mask, max_vx and zero_src are not used by this variant */
+    for (; w != 0; w--, dst++)
+    {
+	BILINEAR_INTERPOLATE_ONE_PIXEL (result);
+	store (dst, result);
+    }
+    _mm_empty ();
+}
+
+FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_cover_SRC, /* COVER/PAD/NONE/NORMAL variants share the SRC scanline function */
+			       scaled_bilinear_scanline_mmx_8888_8888_SRC,
+			       uint32_t, uint32_t, uint32_t,
+			       COVER, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_pad_SRC,
+			       scaled_bilinear_scanline_mmx_8888_8888_SRC,
+			       uint32_t, uint32_t, uint32_t,
+			       PAD, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_none_SRC,
+			       scaled_bilinear_scanline_mmx_8888_8888_SRC,
+			       uint32_t, uint32_t, uint32_t,
+			       NONE, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_normal_SRC,
+			       scaled_bilinear_scanline_mmx_8888_8888_SRC,
+			       uint32_t, uint32_t, uint32_t,
+			       NORMAL, FLAG_NONE)
+
+/* Bilinearly sample w pixels and composite each one OVER dst. */
+static force_inline void
+scaled_bilinear_scanline_mmx_8888_8888_OVER (uint32_t *       dst,
+					     const uint32_t * mask,
+					     const uint32_t * src_top,
+					     const uint32_t * src_bottom,
+					     int32_t          w,
+					     int              wt,
+					     int              wb,
+					     pixman_fixed_t   vx,
+					     pixman_fixed_t   unit_x,
+					     pixman_fixed_t   max_vx,
+					     pixman_bool_t    zero_src)
+{
+    BILINEAR_DECLARE_VARIABLES;
+    __m64 src_pix;
+
+    /* mask, max_vx and zero_src are not used by this variant */
+    for (; w != 0; w--, dst++)
+    {
+	BILINEAR_INTERPOLATE_ONE_PIXEL (src_pix);
+
+	/* an all-zero sample leaves the destination pixel untouched */
+	if (is_zero (src_pix))
+	    continue;
+
+	store8888 (dst,
+		   core_combine_over_u_pixel_mmx (src_pix, load (dst)));
+    }
+
+    _mm_empty ();
+}
+
+FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_cover_OVER, /* COVER/PAD/NONE/NORMAL variants share the OVER scanline function */
+			       scaled_bilinear_scanline_mmx_8888_8888_OVER,
+			       uint32_t, uint32_t, uint32_t,
+			       COVER, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_pad_OVER,
+			       scaled_bilinear_scanline_mmx_8888_8888_OVER,
+			       uint32_t, uint32_t, uint32_t,
+			       PAD, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_none_OVER,
+			       scaled_bilinear_scanline_mmx_8888_8888_OVER,
+			       uint32_t, uint32_t, uint32_t,
+			       NONE, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_normal_OVER,
+			       scaled_bilinear_scanline_mmx_8888_8888_OVER,
+			       uint32_t, uint32_t, uint32_t,
+			       NORMAL, FLAG_NONE)
+
+/*
+ * Bilinearly sample w pixels, scale each by its 8-bit mask value and
+ * composite it OVER dst; pixels whose mask is 0 are skipped.
+ */
+static force_inline void
+scaled_bilinear_scanline_mmx_8888_8_8888_OVER (uint32_t *       dst,
+					       const uint8_t  * mask,
+					       const uint32_t * src_top,
+					       const uint32_t * src_bottom,
+					       int32_t          w,
+					       int              wt,
+					       int              wb,
+					       pixman_fixed_t   vx,
+					       pixman_fixed_t   unit_x,
+					       pixman_fixed_t   max_vx,
+					       pixman_bool_t    zero_src)
+{
+    BILINEAR_DECLARE_VARIABLES;
+    __m64 pix1, pix2;
+    uint32_t m;
+
+    while (w)
+    {
+	m = (uint32_t) *mask++;
+
+	if (m)
+	{
+	    __m64 ms, md, ma, msa;
+
+	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+	    /* is_opaque () tests 16-bit channels, so unpack before asking */
+	    ms = _mm_unpacklo_pi8 (pix1, _mm_setzero_si64 ());
+
+	    if (m == 0xff && is_opaque (ms))
+	    {
+		store (dst, pix1);
+	    }
+	    else
+	    {
+		pix2 = load (dst);
+		ma = expand_alpha_rev (to_m64 (m));
+		md = _mm_unpacklo_pi8 (pix2, _mm_setzero_si64 ());
+		msa = expand_alpha (ms);
+		store8888 (dst, (in_over (ms, msa, ma, md)));
+	    }
+	}
+	else
+	    BILINEAR_SKIP_ONE_PIXEL ();
+
+	w--;
+	dst++;
+    }
+    _mm_empty ();
+}
+
+FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_cover_OVER, /* COVER/PAD/NONE/NORMAL variants share the masked OVER scanline function */
+			       scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
+			       uint32_t, uint8_t, uint32_t, /* uint8_t matches the scanline's mask type */
+			       COVER, FLAG_HAVE_NON_SOLID_MASK)
+FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_pad_OVER,
+			       scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
+			       uint32_t, uint8_t, uint32_t,
+			       PAD, FLAG_HAVE_NON_SOLID_MASK)
+FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_none_OVER,
+			       scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
+			       uint32_t, uint8_t, uint32_t,
+			       NONE, FLAG_HAVE_NON_SOLID_MASK)
+FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_normal_OVER,
+			       scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
+			       uint32_t, uint8_t, uint32_t,
+			       NORMAL, FLAG_HAVE_NON_SOLID_MASK)
+
 static uint32_t *
 mmx_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
 {
@@ -3761,6 +4012,23 @@ static const pixman_fast_path_t mmx_fast_paths[] =
     PIXMAN_STD_FAST_PATH    (IN,   a8,       null,     a8,       mmx_composite_in_8_8              ),
     PIXMAN_STD_FAST_PATH    (IN,   solid,    a8,       a8,       mmx_composite_in_n_8_8            ),
 
+    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8,          a8r8g8b8, mmx_8888_8888                     ),
+    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8,          x8r8g8b8, mmx_8888_8888                     ),
+    SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8,          x8r8g8b8, mmx_8888_8888                     ),
+    SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8,          a8b8g8r8, mmx_8888_8888                     ),
+    SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8,          x8b8g8r8, mmx_8888_8888                     ),
+    SIMPLE_BILINEAR_FAST_PATH (SRC, x8b8g8r8,          x8b8g8r8, mmx_8888_8888                     ),
+
+    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8,         x8r8g8b8, mmx_8888_8888                     ),
+    SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8,         x8b8g8r8, mmx_8888_8888                     ),
+    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8,         a8r8g8b8, mmx_8888_8888                     ),
+    SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8,         a8b8g8r8, mmx_8888_8888                     ),
+
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mmx_8888_8_8888                   ),
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, mmx_8888_8_8888                   ),
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mmx_8888_8_8888                   ),
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, mmx_8888_8_8888                   ),
+
     { PIXMAN_OP_NONE },
 };
 
diff --git a/pixman/pixman/pixman-private.h b/pixman/pixman/pixman-private.h
index 72e3b4f6d..0c27798b0 100644
--- a/pixman/pixman/pixman-private.h
+++ b/pixman/pixman/pixman-private.h
@@ -1,10 +1,24 @@
+#ifndef PIXMAN_PRIVATE_H
+#define PIXMAN_PRIVATE_H
+
+/*
+ * The defines which are shared between C and assembly code
+ */
+
+/* bilinear interpolation precision: number of weight bits (must be <= 8) */
+#define BILINEAR_INTERPOLATION_BITS 7
+#define BILINEAR_INTERPOLATION_RANGE (1 << BILINEAR_INTERPOLATION_BITS) /* 2^BITS */
+
+/*
+ * C specific part
+ */
+
+#ifndef __ASSEMBLER__
+
 #ifndef PACKAGE
 #  error config.h must be included before pixman-private.h
 #endif
 
-#ifndef PIXMAN_PRIVATE_H
-#define PIXMAN_PRIVATE_H
-
 #define PIXMAN_DISABLE_DEPRECATED
 #define PIXMAN_USE_INTERNAL_API
 
@@ -1052,4 +1066,6 @@ void pixman_timer_register (pixman_timer_t *timer);
 
 #endif /* PIXMAN_TIMERS */
 
+#endif /* __ASSEMBLER__ */
+
 #endif /* PIXMAN_PRIVATE_H */
diff --git a/pixman/pixman/pixman-sse2.c b/pixman/pixman/pixman-sse2.c
index ef82a18c3..665eeadbe 100644
--- a/pixman/pixman/pixman-sse2.c
+++ b/pixman/pixman/pixman-sse2.c
@@ -5364,11 +5364,15 @@ FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
 			      scaled_nearest_scanline_sse2_8888_n_8888_OVER,
 			      uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE)
 
+#define BMSK ((1 << BILINEAR_INTERPOLATION_BITS) - 1) /* low BILINEAR_INTERPOLATION_BITS bits set */
+
 #define BILINEAR_DECLARE_VARIABLES						\
     const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt);	\
     const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb);	\
-    const __m128i xmm_xorc = _mm_set_epi16 (0, 0, 0, 0, 0xff, 0xff, 0xff, 0xff);\
-    const __m128i xmm_addc = _mm_set_epi16 (0, 0, 0, 0, 1, 1, 1, 1);		\
+    const __m128i xmm_xorc8 = _mm_set_epi16 (0, 0, 0, 0, BMSK, BMSK, BMSK, BMSK);\
+    const __m128i xmm_addc8 = _mm_set_epi16 (0, 0, 0, 0, 1, 1, 1, 1);		\
+    const __m128i xmm_xorc7 = _mm_set_epi16 (0, BMSK, 0, BMSK, 0, BMSK, 0, BMSK);\
+    const __m128i xmm_addc7 = _mm_set_epi16 (0, 1, 0, 1, 0, 1, 0, 1);		\
     const __m128i xmm_ux = _mm_set_epi16 (unit_x, unit_x, unit_x, unit_x,	\
 					  unit_x, unit_x, unit_x, unit_x);	\
     const __m128i xmm_zero = _mm_setzero_si128 ();				\
@@ -5388,18 +5392,30 @@ do {										\
 					xmm_wt),				\
 		       _mm_mullo_epi16 (_mm_unpacklo_epi8 (blbr, xmm_zero),	\
 					xmm_wb));				\
-    /* calculate horizontal weights */						\
-    xmm_wh = _mm_add_epi16 (xmm_addc,						\
-			    _mm_xor_si128 (xmm_xorc,				\
-					   _mm_srli_epi16 (xmm_x, 8)));		\
-    xmm_x = _mm_add_epi16 (xmm_x, xmm_ux);					\
-    /* horizontal interpolation */						\
-    xmm_lo = _mm_mullo_epi16 (a, xmm_wh);					\
-    xmm_hi = _mm_mulhi_epu16 (a, xmm_wh);					\
-    a = _mm_add_epi32 (_mm_unpacklo_epi16 (xmm_lo, xmm_hi),			\
-		       _mm_unpackhi_epi16 (xmm_lo, xmm_hi));			\
+    if (BILINEAR_INTERPOLATION_BITS < 8)					\
+    {										\
+	/* calculate horizontal weights */					\
+	xmm_wh = _mm_add_epi16 (xmm_addc7, _mm_xor_si128 (xmm_xorc7,		\
+		   _mm_srli_epi16 (xmm_x, 16 - BILINEAR_INTERPOLATION_BITS)));	\
+	xmm_x = _mm_add_epi16 (xmm_x, xmm_ux);					\
+	/* horizontal interpolation */						\
+	a = _mm_madd_epi16 (_mm_unpackhi_epi16 (_mm_shuffle_epi32 (		\
+		a, _MM_SHUFFLE (1, 0, 3, 2)), a), xmm_wh);			\
+    }										\
+    else									\
+    {										\
+	/* calculate horizontal weights */					\
+	xmm_wh = _mm_add_epi16 (xmm_addc8, _mm_xor_si128 (xmm_xorc8,		\
+		_mm_srli_epi16 (xmm_x, 16 - BILINEAR_INTERPOLATION_BITS)));	\
+	xmm_x = _mm_add_epi16 (xmm_x, xmm_ux);					\
+	/* horizontal interpolation */						\
+	xmm_lo = _mm_mullo_epi16 (a, xmm_wh);					\
+	xmm_hi = _mm_mulhi_epu16 (a, xmm_wh);					\
+	a = _mm_add_epi32 (_mm_unpacklo_epi16 (xmm_lo, xmm_hi),			\
+			   _mm_unpackhi_epi16 (xmm_lo, xmm_hi));		\
+    }										\
     /* shift and pack the result */						\
-    a = _mm_srli_epi32 (a, 16);							\
+    a = _mm_srli_epi32 (a, BILINEAR_INTERPOLATION_BITS * 2);			\
     a = _mm_packs_epi32 (a, a);							\
     a = _mm_packus_epi16 (a, a);						\
     pix = _mm_cvtsi128_si32 (a);						\
@@ -5845,6 +5861,9 @@ static const pixman_fast_path_t sse2_fast_paths[] =
     SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
     SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
     SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, sse2_8888_8888),
+    SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
+    SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
+    SIMPLE_BILINEAR_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, sse2_8888_8888),
 
     SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
     SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
diff --git a/pixman/test/affine-test.c b/pixman/test/affine-test.c
index a4ceed3da..6827cc3a8 100644
--- a/pixman/test/affine-test.c
+++ b/pixman/test/affine-test.c
@@ -301,11 +301,21 @@ test_composite (int      testnum,
     return crc32;
 }
 
+#if BILINEAR_INTERPOLATION_BITS == 8 /* expected checksum depends on the interpolation precision */
+#define CHECKSUM 0x1EF2175A
+#elif BILINEAR_INTERPOLATION_BITS == 7
+#define CHECKSUM 0x74050F50
+#elif BILINEAR_INTERPOLATION_BITS == 4
+#define CHECKSUM 0x4362EAE8
+#else /* no expected value recorded for any other precision */
+#define CHECKSUM 0x00000000
+#endif
+
 int
 main (int argc, const char *argv[])
 {
     pixman_disable_out_of_bounds_workaround ();
 
-    return fuzzer_test_main ("affine", 8000000, 0x1EF2175A,
+    return fuzzer_test_main ("affine", 8000000, CHECKSUM,
 			     test_composite, argc, argv);
 }
diff --git a/pixman/test/scaling-test.c b/pixman/test/scaling-test.c
index 6f2da1432..44c4f3de4 100644
--- a/pixman/test/scaling-test.c
+++ b/pixman/test/scaling-test.c
@@ -357,11 +357,21 @@ test_composite (int      testnum,
     return crc32;
 }
 
+#if BILINEAR_INTERPOLATION_BITS == 8 /* expected checksum depends on the interpolation precision */
+#define CHECKSUM 0x80DF1CB2
+#elif BILINEAR_INTERPOLATION_BITS == 7
+#define CHECKSUM 0x2818D5FB
+#elif BILINEAR_INTERPOLATION_BITS == 4
+#define CHECKSUM 0x387540A5
+#else /* no expected value recorded for any other precision */
+#define CHECKSUM 0x00000000
+#endif
+
 int
 main (int argc, const char *argv[])
 {
     pixman_disable_out_of_bounds_workaround ();
 
-    return fuzzer_test_main("scaling", 8000000, 0x80DF1CB2,
+    return fuzzer_test_main("scaling", 8000000, CHECKSUM,
 			    test_composite, argc, argv);
 }
diff --git a/xorg-server/randr/rrscreen.c b/xorg-server/randr/rrscreen.c
index 55110e088..c564d1f96 100644
--- a/xorg-server/randr/rrscreen.c
+++ b/xorg-server/randr/rrscreen.c
@@ -195,7 +195,7 @@ ProcRRGetScreenSizeRange(ClientPtr client)
     rrScrPrivPtr pScrPriv;
     int rc;
 
-    REQUEST_SIZE_MATCH(xRRGetScreenInfoReq);
+    REQUEST_SIZE_MATCH(xRRGetScreenSizeRangeReq);
     rc = dixLookupWindow(&pWin, stuff->window, client, DixGetAttrAccess);
     if (rc != Success)
         return rc;
-- 
cgit v1.2.3