diff options
| author | marha <marha@users.sourceforge.net> | 2012-07-02 08:51:35 +0200 | 
|---|---|---|
| committer | marha <marha@users.sourceforge.net> | 2012-07-02 08:51:35 +0200 | 
| commit | fcc2cc7dabb46c39a76351fc12da4e9ad9d1d817 (patch) | |
| tree | bcc60a76e7cc205710b91c67ff700c26886ea3e8 | |
| parent | 393178cdbca247c6ad077f7dab9a97d6817c625c (diff) | |
| parent | fdef5bff99e6079f64bc6b91c91b42195c85adeb (diff) | |
| download | vcxsrv-fcc2cc7dabb46c39a76351fc12da4e9ad9d1d817.tar.gz vcxsrv-fcc2cc7dabb46c39a76351fc12da4e9ad9d1d817.tar.bz2 vcxsrv-fcc2cc7dabb46c39a76351fc12da4e9ad9d1d817.zip | |
Merge remote-tracking branch 'origin/released'
Conflicts:
	pixman/pixman/pixman-sse2.c
26 files changed, 898 insertions, 427 deletions
| diff --git a/mesalib/src/gallium/auxiliary/Android.mk b/mesalib/src/gallium/auxiliary/Android.mk index 0c37dd31a..11fc2256a 100644 --- a/mesalib/src/gallium/auxiliary/Android.mk +++ b/mesalib/src/gallium/auxiliary/Android.mk @@ -44,8 +44,7 @@ $(LOCAL_GENERATED_SOURCES): PRIVATE_CUSTOM_TOOL = $(PRIVATE_PYTHON) $^ > $@  $(intermediates)/indices/u_indices_gen.c \  $(intermediates)/indices/u_unfilled_gen.c \ -$(intermediates)/util/u_format_srgb.c \ -$(intermediates)/util/u_half.c: $(intermediates)/%.c: $(LOCAL_PATH)/%.py +$(intermediates)/util/u_format_srgb.c: $(intermediates)/%.c: $(LOCAL_PATH)/%.py  	$(transform-generated-source)  $(intermediates)/util/u_format_table.c: $(intermediates)/%.c: $(LOCAL_PATH)/%.py $(LOCAL_PATH)/util/u_format.csv diff --git a/mesalib/src/gallium/auxiliary/Makefile b/mesalib/src/gallium/auxiliary/Makefile index a70ae7384..3ba3f9c40 100644 --- a/mesalib/src/gallium/auxiliary/Makefile +++ b/mesalib/src/gallium/auxiliary/Makefile @@ -39,6 +39,4 @@ util/u_format_srgb.c: util/u_format_srgb.py  util/u_format_table.c: util/u_format_table.py util/u_format_pack.py util/u_format_parse.py util/u_format.csv  	$(PYTHON2) util/u_format_table.py util/u_format.csv > $@ -util/u_half.c: util/u_half.py -	$(PYTHON2) util/u_half.py > $@  # DO NOT DELETE diff --git a/mesalib/src/gallium/auxiliary/Makefile.sources b/mesalib/src/gallium/auxiliary/Makefile.sources index 277428b38..28a176d68 100644 --- a/mesalib/src/gallium/auxiliary/Makefile.sources +++ b/mesalib/src/gallium/auxiliary/Makefile.sources @@ -155,8 +155,7 @@ GENERATED_SOURCES := \  	indices/u_indices_gen.c \  	indices/u_unfilled_gen.c \  	util/u_format_srgb.c \ -	util/u_format_table.c \ -	util/u_half.c +	util/u_format_table.c  GALLIVM_SOURCES := \          gallivm/lp_bld_arit.c \ diff --git a/mesalib/src/gallium/auxiliary/SConscript b/mesalib/src/gallium/auxiliary/SConscript index 07c420e13..bfd5ec34c 100644 --- a/mesalib/src/gallium/auxiliary/SConscript +++ b/mesalib/src/gallium/auxiliary/SConscript @@ -35,13 +35,6 @@ env.CodeGenerate(      command = python_cmd + ' $SCRIPT $SOURCE > $TARGET'  ) -env.CodeGenerate( -    target = 'util/u_half.c', -    script = 'util/u_half.py', -    source = [], -    command = python_cmd + ' $SCRIPT > $TARGET' -) -  env.Depends('util/u_format_table.c', [      '#src/gallium/auxiliary/util/u_format_parse.py',      'util/u_format_pack.py',  diff --git a/mesalib/src/gallium/auxiliary/util/.gitignore b/mesalib/src/gallium/auxiliary/util/.gitignore index 5dd0408ef..da74de623 100644 --- a/mesalib/src/gallium/auxiliary/util/.gitignore +++ b/mesalib/src/gallium/auxiliary/util/.gitignore @@ -1,3 +1,2 @@  u_format_srgb.c  u_format_table.c -u_half.c diff --git a/mesalib/src/gallium/auxiliary/util/u_format.c b/mesalib/src/gallium/auxiliary/util/u_format.c index cfc4a17a0..6f4529835 100644 --- a/mesalib/src/gallium/auxiliary/util/u_format.c +++ b/mesalib/src/gallium/auxiliary/util/u_format.c @@ -159,6 +159,38 @@ util_format_is_pure_uint(enum pipe_format format)  }  boolean +util_format_is_array(const struct util_format_description *desc) +{ +   unsigned chan; + +   if (desc->layout != UTIL_FORMAT_LAYOUT_PLAIN || +       desc->colorspace != UTIL_FORMAT_COLORSPACE_RGB || +       desc->block.width != 1 || +       desc->block.height != 1) { +      return FALSE; +   } + +   for (chan = 0; chan < desc->nr_channels; ++chan) { +      if (desc->swizzle[chan] != chan) +         return FALSE; + +      if (desc->channel[chan].type != desc->channel[0].type) +         return FALSE; + +      if (desc->channel[chan].normalized != desc->channel[0].normalized) +         return FALSE; + +      if (desc->channel[chan].pure_integer != desc->channel[0].pure_integer) +         return FALSE; + +      if (desc->channel[chan].size != desc->channel[0].size) +         return FALSE; +   } + +   return TRUE; +} + +boolean  util_format_is_luminance_alpha(enum pipe_format format)  {     const struct util_format_description *desc = diff --git a/mesalib/src/gallium/auxiliary/util/u_format.h b/mesalib/src/gallium/auxiliary/util/u_format.h index 1718fb5e2..e35e164b4 100644 --- a/mesalib/src/gallium/auxiliary/util/u_format.h +++ b/mesalib/src/gallium/auxiliary/util/u_format.h @@ -591,6 +591,13 @@ boolean  util_format_is_pure_uint(enum pipe_format format);  /** + * Whether the format is a simple array format where all channels + * are of the same type and can be loaded from memory as a vector + */ +boolean +util_format_is_array(const struct util_format_description *desc); + +/**   * Check if the src format can be blitted to the destination format with   * a simple memcpy.  For example, blitting from RGBA to RGBx is OK, but not   * the reverse. diff --git a/mesalib/src/gallium/auxiliary/util/u_format_tests.c b/mesalib/src/gallium/auxiliary/util/u_format_tests.c index fc29d8d48..457fda6c8 100644 --- a/mesalib/src/gallium/auxiliary/util/u_format_tests.c +++ b/mesalib/src/gallium/auxiliary/util/u_format_tests.c @@ -26,6 +26,9 @@   **************************************************************************/ +#include <float.h> + +#include "pipe/p_config.h"  #include "u_memory.h"  #include "u_format_tests.h" @@ -63,6 +66,9 @@         {{ 0,  0,  0,  0}, { 0,  0,  0,  0}, {0, 0, 0, 0}, {0, 0, 0, 0}}} +#define NAN (0.0 / 0.0) +#define INF (1.0 / 0.0) +  /**   * Test cases.   * @@ -876,7 +882,39 @@ util_format_test_cases[] =      * Half float formats      */ -   {PIPE_FORMAT_R16_FLOAT, PACKED_1x16(0xffff), PACKED_1x16(0x0000), UNPACKED_1x1(  0.0, 0.0, 0.0, 1.0)}, +   /* Minimum positive normal */ +   {PIPE_FORMAT_R16_FLOAT, PACKED_1x16(0xffff), PACKED_1x16(0x0400), UNPACKED_1x1( 6.10352E-5, 0.0, 0.0, 1.0)}, + +   /* Max denormal */ +   {PIPE_FORMAT_R16_FLOAT, PACKED_1x16(0xffff), PACKED_1x16(0x03FF), UNPACKED_1x1( 6.09756E-5, 0.0, 0.0, 1.0)}, + +   /* Minimum positive denormal */ +   {PIPE_FORMAT_R16_FLOAT, PACKED_1x16(0xffff), PACKED_1x16(0x0001), UNPACKED_1x1( 5.96046E-8, 0.0, 0.0, 1.0)}, + +   /* Min representable value */ +   {PIPE_FORMAT_R16_FLOAT, PACKED_1x16(0xffff), PACKED_1x16(0xfbff), UNPACKED_1x1(   -65504.0, 0.0, 0.0, 1.0)}, + +   /* Max representable value */ +   {PIPE_FORMAT_R16_FLOAT, PACKED_1x16(0xffff), PACKED_1x16(0x7bff), UNPACKED_1x1(    65504.0, 0.0, 0.0, 1.0)}, + +#if !defined(PIPE_CC_MSVC) + +   /* NaNs */ +   {PIPE_FORMAT_R16_FLOAT, PACKED_1x16(0xffff), PACKED_1x16(0x7c01), UNPACKED_1x1(        NAN, 0.0, 0.0, 1.0)}, +   {PIPE_FORMAT_R16_FLOAT, PACKED_1x16(0xffff), PACKED_1x16(0xfc01), UNPACKED_1x1(       -NAN, 0.0, 0.0, 1.0)}, +   {PIPE_FORMAT_R16_FLOAT, PACKED_1x16(0xffff), PACKED_1x16(0x7fff), UNPACKED_1x1(        NAN, 0.0, 0.0, 1.0)}, +   {PIPE_FORMAT_R16_FLOAT, PACKED_1x16(0xffff), PACKED_1x16(0xffff), UNPACKED_1x1(       -NAN, 0.0, 0.0, 1.0)}, + +   /* Inf */ +   {PIPE_FORMAT_R16_FLOAT, PACKED_1x16(0xffff), PACKED_1x16(0x7c00), UNPACKED_1x1(        INF, 0.0, 0.0, 1.0)}, +   {PIPE_FORMAT_R16_FLOAT, PACKED_1x16(0xffff), PACKED_1x16(0xfc00), UNPACKED_1x1(       -INF, 0.0, 0.0, 1.0)}, + +#endif + +   /* Zero, ignore sign */ +   {PIPE_FORMAT_R16_FLOAT, PACKED_1x16(0x7fff), PACKED_1x16(0x8000), UNPACKED_1x1( -0.0, 0.0, 0.0, 1.0)}, +   {PIPE_FORMAT_R16_FLOAT, PACKED_1x16(0x7fff), PACKED_1x16(0x0000), UNPACKED_1x1(  0.0, 0.0, 0.0, 1.0)}, +     {PIPE_FORMAT_R16_FLOAT, PACKED_1x16(0xffff), PACKED_1x16(0x3c00), UNPACKED_1x1(  1.0, 0.0, 0.0, 1.0)},     {PIPE_FORMAT_R16_FLOAT, PACKED_1x16(0xffff), PACKED_1x16(0xbc00), UNPACKED_1x1( -1.0, 0.0, 0.0, 1.0)}, diff --git a/mesalib/src/gallium/auxiliary/util/u_half.h b/mesalib/src/gallium/auxiliary/util/u_half.h index ad030e90c..f7009f548 100644 --- a/mesalib/src/gallium/auxiliary/util/u_half.h +++ b/mesalib/src/gallium/auxiliary/util/u_half.h @@ -35,51 +35,84 @@  extern "C" {  #endif -extern const uint32_t util_half_to_float_mantissa_table[2048]; -extern const uint32_t util_half_to_float_exponent_table[64]; -extern const uint32_t util_half_to_float_offset_table[64]; -extern const uint16_t util_float_to_half_base_table[512]; -extern const uint8_t util_float_to_half_shift_table[512]; -  /* - * Note that if the half float is a signaling NaN, the x87 FPU will turn - * it into a quiet NaN immediately upon loading into a float. - * - * Additionally, denormals may be flushed to zero. + * References for float <-> half conversions   * - * To avoid this, use the floatui functions instead of the float ones - * when just doing conversion rather than computation on the resulting - * floats. + *  http://fgiesen.wordpress.com/2012/03/28/half-to-float-done-quic/ + *  https://gist.github.com/2156668 + *  https://gist.github.com/2144712   */ -static INLINE uint32_t -util_half_to_floatui(uint16_t h) +static INLINE uint16_t +util_float_to_half(float f)  { -   unsigned exp = h >> 10; -   return util_half_to_float_mantissa_table[util_half_to_float_offset_table[exp] + (h & 0x3ff)] + util_half_to_float_exponent_table[exp]; +   uint32_t sign_mask  = 0x80000000; +   uint32_t round_mask = ~0xfff; +   uint32_t f32inf = 0xff << 23; +   uint32_t f16inf = 0x1f << 23; +   uint32_t sign; +   union fi magic; +   union fi f32; +   uint16_t f16; + +   magic.ui = 0xf << 23; + +   f32.f = f; + +   /* Sign */ +   sign = f32.ui & sign_mask; +   f32.ui ^= sign; + +   if (f32.ui == f32inf) { +      /* Inf */ +      f16 = 0x7c00; +   } else if (f32.ui > f32inf) { +      /* NaN */ +      f16 = 0x7e00; +   } else { +      /* Number */ +      f32.ui &= round_mask; +      f32.f  *= magic.f; +      f32.ui -= round_mask; + +      /* Clamp to infinity if overflowed */ +      if (f32.ui > f16inf) +         f32.ui = f16inf; + +      f16 = f32.ui >> 13; +   } + +   /* Sign */ +   f16 |= sign >> 16; + +   return f16;  }  static INLINE float -util_half_to_float(uint16_t h) +util_half_to_float(uint16_t f16)  { -   union fi r; -   r.ui = util_half_to_floatui(h); -   return r.f; -} +   union fi infnan; +   union fi magic; +   union fi f32; -static INLINE uint16_t -util_floatui_to_half(uint32_t v) -{ -   unsigned signexp = v >> 23; -   return util_float_to_half_base_table[signexp] + ((v & 0x007fffff) >> util_float_to_half_shift_table[signexp]); -} +   infnan.ui = 0x8f << 23; +   infnan.f = 65536.0f; +   magic.ui  = 0xef << 23; -static INLINE uint16_t -util_float_to_half(float f) -{ -   union fi i; -   i.f = f; -   return util_floatui_to_half(i.ui); +   /* Exponent / Mantissa */ +   f32.ui = (f16 & 0x7fff) << 13; + +   /* Adjust */ +   f32.f *= magic.f; + +   /* Inf / NaN */ +   if (f32.f >= infnan.f) +      f32.ui |= 0xff << 23; + +   /* Sign */ +   f32.ui |= (f16 & 0x8000) << 16; + +   return f32.f;  }  #ifdef __cplusplus diff --git a/mesalib/src/gallium/auxiliary/util/u_half.py b/mesalib/src/gallium/auxiliary/util/u_half.py deleted file mode 100644 index 915cf3b92..000000000 --- a/mesalib/src/gallium/auxiliary/util/u_half.py +++ /dev/null @@ -1,179 +0,0 @@ -# Copyright 2010 Luca Barbieri -# -# Permission is hereby granted, free of charge, to any person obtaining -# a copy of this software and associated documentation files (the -# "Software"), to deal in the Software without restriction, including -# without limitation the rights to use, copy, modify, merge, publish, -# distribute, sublicense, and/or sell copies of the Software, and to -# permit persons to whom the Software is furnished to do so, subject to -# the following conditions: -# -# The above copyright notice and this permission notice (including the -# next paragraph) shall be included in all copies or substantial -# portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. -# IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE -# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION -# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION -# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# ************************************************************************* - -# The code is a reimplementation of the algorithm in -#  www.fox-toolkit.org/ftp/fasthalffloatconversion.pdf -# "Fast Half Float Conversions" by Jeroen van der Zijp, Nov 2008 -# -# The table contents have been slightly changed so that the exponent -# bias is now in the exponent table instead of the mantissa table (mostly -# for cosmetic reasons, and because it theoretically allows a variant -# that flushes denormal to zero but uses a mantissa table with 24-bit -# entries). -# -# The tables are also constructed slightly differently. -# - -# Note that using a 64K * 4 table is a terrible idea since it will not fit -# in the L1 cache and will massively pollute the L2 cache as well -# -# These should instead fit in the L1 cache. -# -# TODO: we could use a denormal bias table instead of the mantissa/offset -# tables: this would reduce the L1 cache usage from 8704 to 2304 bytes -# but would involve more computation -# -# Note however that if denormals are never encountered, the L1 cache usage -# is only about 4608 bytes anyway. - -table_index = None -table_length = None - -def begin(t, n, l): -	global table_length -	global table_index -	table_index = 0 -	table_length = l -	print -	print "const " + t + " " + n + "[" + str(l) + "] = {" - -def value(v): -	global table_index -	table_index += 1 -	print "\t" + hex(v) + "," - -def end(): -	global table_length -	global table_index -	print "};" -	assert table_index == table_length - -print "/* This file is autogenerated by u_half.py. Do not edit directly. */" -print "#include \"util/u_half.h\"" - -begin("uint32_t", "util_half_to_float_mantissa_table", 2048) -# zero -value(0) - -# denormals -for i in xrange(1, 1024): -	m = i << 13 -	e = 0 - -	# normalize number -	while (m & 0x00800000) == 0: -		e -= 0x00800000 -		m <<= 1 - -	m &= ~0x00800000 -	e += 0x38800000 -	value(m | e) - -# normals -for i in xrange(1024, 2048): -	value((i - 1024) << 13) -end() - -begin("uint32_t", "util_half_to_float_exponent_table", 64) -# positive zero or denormals -value(0) - -# positive numbers -for i in xrange(1, 31): -	value(0x38000000 + (i << 23)) - -# positive infinity/NaN -value(0x7f800000) - -# negative zero or denormals -value(0x80000000) - -# negative numbers -for i in range(33, 63): -	value(0xb8000000 + ((i - 32) << 23)) - -# negative infinity/NaN -value(0xff800000) -end() - -begin("uint32_t", "util_half_to_float_offset_table", 64) -# positive zero or denormals -value(0) - -# positive normals -for i in range(1, 32): -	value(1024) - -# negative zero or denormals -value(0) - -# negative normals -for i in xrange(33, 64): -	value(1024) -end() - -begin("uint16_t", "util_float_to_half_base_table", 512) -for sign in (0, 0x8000): -	# very small numbers mapping to zero -	for i in xrange(-127, -24): -		value(sign | 0) - -	# small numbers mapping to denormals -	for i in xrange(-24, -14): -		value(sign | (0x400 >> (-14 -i))) - -	# normal numbers -	for i in xrange(-14, 16): -		value(sign | ((i + 15) << 10)) - -	# large numbers mapping to infinity -	for i in xrange(16, 128): -		value(sign | 0x7c00) - -	# infinity and NaNs -	value(sign | 0x7c00) -end() - -begin("uint8_t", "util_float_to_half_shift_table", 512) -for sign in (0, 0x8000): -	# very small numbers mapping to zero -	for i in xrange(-127, -24): -		value(24) - -	# small numbers mapping to denormals -	for i in xrange(-24, -14): -		value(-1 - i) - -	# normal numbers -	for i in xrange(-14, 16): -		value(13) - -	# large numbers mapping to infinity -	for i in xrange(16, 128): -		value(24) - -	# infinity and NaNs -	value(13) -end() - diff --git a/mesalib/src/gallium/auxiliary/util/u_math.h b/mesalib/src/gallium/auxiliary/util/u_math.h index f6196665f..724b136b5 100644 --- a/mesalib/src/gallium/auxiliary/util/u_math.h +++ b/mesalib/src/gallium/auxiliary/util/u_math.h @@ -183,6 +183,13 @@ union fi {  }; +union di { +   double d; +   int64_t i; +   uint64_t ui; +}; + +  /**   * Fast version of 2^x   * Identity: exp2(a + b) = exp2(a) * exp2(b) @@ -325,14 +332,107 @@ util_is_approx(float a, float b, float tol)  /** - * Test if x is NaN or +/- infinity. + * util_is_X_inf_or_nan = test if x is NaN or +/- Inf + * util_is_X_nan        = test if x is NaN + * util_X_inf_sign      = return +1 for +Inf, -1 for -Inf, or 0 for not Inf + * + * NaN can be checked with x != x, however this fails with the fast math flag + **/ + + +/** + * Single-float   */  static INLINE boolean  util_is_inf_or_nan(float x)  {     union fi tmp;     tmp.f = x; -   return !(int)((unsigned int)((tmp.i & 0x7fffffff)-0x7f800000) >> 31); +   return (tmp.ui & 0x7f800000) == 0x7f800000; +} + + +static INLINE boolean +util_is_nan(float x) +{ +   union fi tmp; +   tmp.f = x; +   return (tmp.ui & 0x7fffffff) > 0x7f800000; +} + + +static INLINE int +util_inf_sign(float x) +{ +   union fi tmp; +   tmp.f = x; +   if ((tmp.ui & 0x7fffffff) != 0x7f800000) { +      return 0; +   } + +   return (x < 0) ? -1 : 1; +} + + +/** + * Double-float + */ +static INLINE boolean +util_is_double_inf_or_nan(double x) +{ +   union di tmp; +   tmp.d = x; +   return (tmp.ui & 0x7ff0000000000000) == 0x7ff0000000000000; +} + + +static INLINE boolean +util_is_double_nan(double x) +{ +   union di tmp; +   tmp.d = x; +   return (tmp.ui & 0x7fffffffffffffff) > 0x7ff0000000000000; +} + + +static INLINE int +util_double_inf_sign(double x) +{ +   union di tmp; +   tmp.d = x; +   if ((tmp.ui & 0x7fffffffffffffff) != 0x7ff0000000000000) { +      return 0; +   } + +   return (x < 0) ? -1 : 1; +} + + +/** + * Half-float + */ +static INLINE boolean +util_is_half_inf_or_nan(int16_t x) +{ +   return (x & 0x7c00) == 0x7c00; +} + + +static INLINE boolean +util_is_half_nan(int16_t x) +{ +   return (x & 0x7fff) > 0x7c00; +} + + +static INLINE int +util_half_inf_sign(int16_t x) +{ +   if ((x & 0x7fff) != 0x7c00) { +      return 0; +   } + +   return (x < 0) ? -1 : 1;  } diff --git a/mesalib/src/mesa/main/mtypes.h b/mesalib/src/mesa/main/mtypes.h index 5768ed7cd..bdbb5137e 100644 --- a/mesalib/src/mesa/main/mtypes.h +++ b/mesalib/src/mesa/main/mtypes.h @@ -3200,12 +3200,6 @@ struct gl_dlist_state     GLubyte ActiveMaterialSize[MAT_ATTRIB_MAX];     GLfloat CurrentMaterial[MAT_ATTRIB_MAX][4]; -   GLubyte ActiveIndex; -   GLfloat CurrentIndex; -    -   GLubyte ActiveEdgeFlag; -   GLboolean CurrentEdgeFlag; -     struct {        /* State known to have been set by the currently-compiling display         * list.  Used to eliminate some redundant state changes. diff --git a/mesalib/src/mesa/state_tracker/st_cb_fbo.c b/mesalib/src/mesa/state_tracker/st_cb_fbo.c index 10f4e09cf..e1818abb9 100644 --- a/mesalib/src/mesa/state_tracker/st_cb_fbo.c +++ b/mesalib/src/mesa/state_tracker/st_cb_fbo.c @@ -57,10 +57,6 @@  #include "util/u_surface.h" -/** Set to 1 to enable extra debug code */ -#define ST_DEBUG_FBO 0 - -  static GLboolean  st_renderbuffer_alloc_sw_storage(struct gl_context * ctx,                                   struct gl_renderbuffer *rb, @@ -480,9 +476,9 @@ st_finish_render_texture(struct gl_context *ctx,  static void  st_fbo_invalid(const char *reason)  { -#if ST_DEBUG_FBO -   debug_printf("Invalid FBO: %s\n", reason); -#endif +   if (MESA_DEBUG_FLAGS & DEBUG_INCOMPLETE_FBO) { +      _mesa_debug(NULL, "Invalid FBO: %s\n", reason); +   }  } diff --git a/pixman/pixman/Makefile.am b/pixman/pixman/Makefile.am index 1b232ad0f..deacf8728 100644 --- a/pixman/pixman/Makefile.am +++ b/pixman/pixman/Makefile.am @@ -92,6 +92,7 @@ endif  # iwmmxt code  if USE_ARM_IWMMXT +libpixman_iwmmxt_la_SOURCES = pixman-mmx.c  noinst_LTLIBRARIES += libpixman-iwmmxt.la  libpixman_1_la_LIBADD += libpixman-iwmmxt.la diff --git a/pixman/pixman/loongson-mmintrin.h b/pixman/pixman/loongson-mmintrin.h index 1a114fe0f..086c6e0f1 100644 --- a/pixman/pixman/loongson-mmintrin.h +++ b/pixman/pixman/loongson-mmintrin.h @@ -45,6 +45,28 @@ _mm_setzero_si64 (void)  }  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_add_pi16 (__m64 __m1, __m64 __m2) +{ +	__m64 ret; +	asm("paddh %0, %1, %2\n\t" +	   : "=f" (ret) +	   : "f" (__m1), "f" (__m2) +	); +	return ret; +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_add_pi32 (__m64 __m1, __m64 __m2) +{ +	__m64 ret; +	asm("paddw %0, %1, %2\n\t" +	   : "=f" (ret) +	   : "f" (__m1), "f" (__m2) +	); +	return ret; +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))  _mm_adds_pu16 (__m64 __m1, __m64 __m2)  {  	__m64 ret; @@ -150,6 +172,78 @@ _mm_packs_pu16 (__m64 __m1, __m64 __m2)  }  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_packs_pi32 (__m64 __m1, __m64 __m2) +{ +	__m64 ret; +	asm("packsswh %0, %1, %2\n\t" +	   : "=f" (ret) +	   : "f" (__m1), "f" (__m2) +	); +	return ret; +} + +#define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \ + (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0)) +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set_pi16 (uint16_t __w3, uint16_t __w2, uint16_t __w1, uint16_t __w0) +{ +	if (__builtin_constant_p (__w3) && +	    __builtin_constant_p (__w2) && +	    __builtin_constant_p (__w1) && +	    __builtin_constant_p (__w0)) +	{ +		uint64_t val = ((uint64_t)__w3 << 48) +			     | ((uint64_t)__w2 << 32) +			     | ((uint64_t)__w1 << 16) +			     | ((uint64_t)__w0 <<  0); +		return *(__m64 *)&val; +	} +	else if (__w3 == __w2 && __w2 == __w1 && __w1 == __w0) +	{ +		/* TODO: handle other cases */ +		uint64_t val = __w3; +		uint64_t imm = _MM_SHUFFLE (0, 0, 0, 0); +		__m64 ret; +		asm("pshufh %0, %1, %2\n\t" +		    : "=f" (ret) +		    : "f" (*(__m64 *)&val), "f" (*(__m64 *)&imm) +		); +		return ret; +	} +	uint64_t val = ((uint64_t)__w3 << 48) +		     | ((uint64_t)__w2 << 32) +		     | ((uint64_t)__w1 << 16) +		     | ((uint64_t)__w0 <<  0); +	return *(__m64 *)&val; +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set_pi32 (unsigned __i1, unsigned __i0) +{ +	if (__builtin_constant_p (__i1) && +	    __builtin_constant_p (__i0)) +	{ +		uint64_t val = ((uint64_t)__i1 << 32) +			     | ((uint64_t)__i0 <<  0); +		return *(__m64 *)&val; +	} +	else if (__i1 == __i0) +	{ +		uint64_t imm = _MM_SHUFFLE (1, 0, 1, 0); +		__m64 ret; +		asm("pshufh %0, %1, %2\n\t" +		    : "=f" (ret) +		    : "f" (*(__m32 *)&__i1), "f" (*(__m64 *)&imm) +		); +		return ret; +	} +	uint64_t val = ((uint64_t)__i1 << 32) +		     | ((uint64_t)__i0 <<  0); +	return *(__m64 *)&val; +} +#undef _MM_SHUFFLE + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))  _mm_shuffle_pi16 (__m64 __m, int64_t __n)  {  	__m64 ret; @@ -193,6 +287,17 @@ _mm_srli_pi16 (__m64 __m, int64_t __count)  }  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_srli_pi32 (__m64 __m, int64_t __count) +{ +	__m64 ret; +	asm("psrlw %0, %1, %2\n\t" +	   : "=f" (ret) +	   : "f" (__m), "f" (*(__m64 *)&__count) +	); +	return ret; +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))  _mm_srli_si64 (__m64 __m, int64_t __count)  {  	__m64 ret; @@ -204,6 +309,17 @@ _mm_srli_si64 (__m64 __m, int64_t __count)  }  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sub_pi16 (__m64 __m1, __m64 __m2) +{ +	__m64 ret; +	asm("psubh %0, %1, %2\n\t" +	   : "=f" (ret) +	   : "f" (__m1), "f" (__m2) +	); +	return ret; +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))  _mm_unpackhi_pi8 (__m64 __m1, __m64 __m2)  {  	__m64 ret; diff --git a/pixman/pixman/pixman-arm-neon-asm-bilinear.S b/pixman/pixman/pixman-arm-neon-asm-bilinear.S index f7913adb7..e37b5c298 100644 --- a/pixman/pixman/pixman-arm-neon-asm-bilinear.S +++ b/pixman/pixman/pixman-arm-neon-asm-bilinear.S @@ -64,6 +64,7 @@  .altmacro  .p2align 2 +#include "pixman-private.h"  #include "pixman-arm-neon-asm.h"  /* @@ -488,12 +489,12 @@ fname:      vmull.u8  q1, d0, d28      vmlal.u8  q1, d1, d29      /* 5 cycles bubble */ -    vshll.u16 q0, d2, #8 +    vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS      vmlsl.u16 q0, d2, d30      vmlal.u16 q0, d3, d30      /* 5 cycles bubble */      bilinear_duplicate_mask mask_fmt, 1, d4 -    vshrn.u32 d0, q0, #16 +    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)      /* 3 cycles bubble */      vmovn.u16 d0, q0      /* 1 cycle bubble */ @@ -514,16 +515,16 @@ fname:                  q1, q11, d0, d1, d20, d21, d22, d23      bilinear_load_mask mask_fmt, 2, d4      bilinear_load_dst dst_fmt, op, 2, d18, d19, q9 -    vshll.u16 q0, d2, #8 +    vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS      vmlsl.u16 q0, d2, d30      vmlal.u16 q0, d3, d30 -    vshll.u16 q10, d22, #8 +    vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS      vmlsl.u16 q10, d22, d31      vmlal.u16 q10, d23, d31 -    vshrn.u32 d0, q0, #16 -    vshrn.u32 d1, q10, #16 +    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) +    vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)      bilinear_duplicate_mask mask_fmt, 2, d4 -    vshr.u16  q15, q12, #8 +    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)      vadd.u16  q12, q12, q13      vmovn.u16 d0, q0      bilinear_interleave_src_dst \ @@ -544,29 +545,29 @@ fname:                  q3, q9,  d4, d5, d16, d17, d18, d19      pld       [TMP1, PF_OFFS]      sub       TMP1, TMP1, STRIDE -    vshll.u16 q0, d2, #8 +    vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS      vmlsl.u16 q0, d2, d30      vmlal.u16 q0, d3, d30 -    vshll.u16 q10, d22, #8 +    vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS      vmlsl.u16 q10, d22, d31      vmlal.u16 q10, d23, d31 -    vshr.u16  q15, q12, #8 -    vshll.u16 q2, d6, #8 +    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) +    vshll.u16 q2, d6, #BILINEAR_INTERPOLATION_BITS      vmlsl.u16 q2, d6, d30      vmlal.u16 q2, d7, d30 -    vshll.u16 q8, d18, #8 +    vshll.u16 q8, d18, #BILINEAR_INTERPOLATION_BITS      bilinear_load_mask mask_fmt, 4, d22      bilinear_load_dst dst_fmt, op, 4, d2, d3, q1      pld       [TMP1, PF_OFFS]      vmlsl.u16 q8, d18, d31      vmlal.u16 q8, d19, d31      vadd.u16  q12, q12, q13 -    vshrn.u32 d0, q0, #16 -    vshrn.u32 d1, q10, #16 -    vshrn.u32 d4, q2, #16 -    vshrn.u32 d5, q8, #16 +    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) +    vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS) +    vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS) +    vshrn.u32 d5, q8, #(2 * BILINEAR_INTERPOLATION_BITS)      bilinear_duplicate_mask mask_fmt, 4, d22 -    vshr.u16  q15, q12, #8 +    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)      vmovn.u16 d0, q0      vmovn.u16 d1, q2      vadd.u16  q12, q12, q13 @@ -694,13 +695,13 @@ pixman_asm_function fname      blt       0f      tst       OUT, #(1 << dst_bpp_shift)      beq       0f -    vshr.u16  q15, q12, #8 +    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)      vadd.u16  q12, q12, q13      bilinear_process_last_pixel      sub       WIDTH, WIDTH, #1  0:      vadd.u16  q13, q13, q13 -    vshr.u16  q15, q12, #8 +    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)      vadd.u16  q12, q12, q13      cmp       WIDTH, #2 @@ -921,7 +922,7 @@ pixman_asm_function fname      vmull.u8    q10, d22, d28      vmlal.u8    q10, d23, d29 -    vshll.u16   q0, d16, #8 +    vshll.u16   q0, d16, #BILINEAR_INTERPOLATION_BITS      vmlsl.u16   q0, d16, d30      vmlal.u16   q0, d17, d30 @@ -932,27 +933,27 @@ pixman_asm_function fname      vmull.u8    q11, d16, d28      vmlal.u8    q11, d17, d29 -    vshll.u16   q1, d18, #8 +    vshll.u16   q1, d18, #BILINEAR_INTERPOLATION_BITS      vmlsl.u16   q1, d18, d31      vmlal.u16   q1, d19, d31 -    vshr.u16    q15, q12, #8 +    vshr.u16    q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)      vadd.u16    q12, q12, q13  .endm  .macro bilinear_over_8888_8888_process_pixblock_tail -    vshll.u16   q2, d20, #8 +    vshll.u16   q2, d20, #BILINEAR_INTERPOLATION_BITS      vmlsl.u16   q2, d20, d30      vmlal.u16   q2, d21, d30 -    vshll.u16   q3, d22, #8 +    vshll.u16   q3, d22, #BILINEAR_INTERPOLATION_BITS      vmlsl.u16   q3, d22, d31      vmlal.u16   q3, d23, d31 -    vshrn.u32   d0, q0, #16 -    vshrn.u32   d1, q1, #16 +    vshrn.u32   d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) +    vshrn.u32   d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)      vld1.32     {d2, d3}, [OUT, :128]      pld         [OUT, #(prefetch_offset * 4)] -    vshrn.u32   d4, q2, #16 -    vshr.u16    q15, q12, #8 -    vshrn.u32   d5, q3, #16 +    vshrn.u32   d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS) +    vshr.u16    q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) +    vshrn.u32   d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)      vmovn.u16   d6, q0      vmovn.u16   d7, q2      vuzp.8      d6, d7 @@ -975,7 +976,7 @@ pixman_asm_function fname  .endm  .macro bilinear_over_8888_8888_process_pixblock_tail_head -                                            vshll.u16   q2, d20, #8 +                                            vshll.u16   q2, d20, #BILINEAR_INTERPOLATION_BITS      mov         TMP1, X, asr #16      add         X, X, UX      add         TMP1, TOP, TMP1, asl #2 @@ -984,21 +985,21 @@ pixman_asm_function fname      add         X, X, UX      add         TMP2, TOP, TMP2, asl #2                                              vmlal.u16   q2, d21, d30 -                                            vshll.u16   q3, d22, #8 +                                            vshll.u16   q3, d22, #BILINEAR_INTERPOLATION_BITS      vld1.32     {d20}, [TMP1], STRIDE                                              vmlsl.u16   q3, d22, d31                                              vmlal.u16   q3, d23, d31      vld1.32     {d21}, [TMP1]      vmull.u8    q8, d20, d28      vmlal.u8    q8, d21, d29 -                                            vshrn.u32   d0, q0, #16 -                                            vshrn.u32   d1, q1, #16 +                                            vshrn.u32   d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) +                                            vshrn.u32   d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)                                              vld1.32     {d2, d3}, [OUT, :128]                                              pld         [OUT, PF_OFFS] -                                            vshrn.u32   d4, q2, #16 -                                            vshr.u16    q15, q12, #8 +                                            vshrn.u32   d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS) +                                            vshr.u16    q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)      vld1.32     {d22}, [TMP2], STRIDE -                                            vshrn.u32   d5, q3, #16 +                                            vshrn.u32   d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)                                              vmovn.u16   d6, q0      vld1.32     {d23}, [TMP2]      vmull.u8    q9, d22, d28 @@ -1022,7 +1023,7 @@ pixman_asm_function fname      vmlal.u8    q10, d23, d29                                              vmull.u8    q11, d2, d4                                              vmull.u8    q2, d3, d4 -    vshll.u16   q0, d16, #8 +    vshll.u16   q0, d16, #BILINEAR_INTERPOLATION_BITS      vmlsl.u16   q0, d16, d30                                              vrshr.u16   q1, q11, #8      vmlal.u16   q0, d17, d30 @@ -1037,12 +1038,12 @@ pixman_asm_function fname      vmull.u8    q11, d16, d28      vmlal.u8    q11, d17, d29                                              vuzp.8      d6, d7 -    vshll.u16   q1, d18, #8 +    vshll.u16   q1, d18, #BILINEAR_INTERPOLATION_BITS                                              vuzp.8      d6, d7      vmlsl.u16   q1, d18, d31                                              vadd.u16    q12, q12, q13      vmlal.u16   q1, d19, d31 -    vshr.u16    q15, q12, #8 +    vshr.u16    q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)      vadd.u16    q12, q12, q13                                              vst1.32     {d6, d7}, [OUT, :128]!  .endm @@ -1081,14 +1082,14 @@ pixman_asm_function fname      vmull.u8    q3, d2, d28      vmlal.u8    q2, d1, d29      vmlal.u8    q3, d3, d29 -    vshll.u16   q0, d4, #8 -    vshll.u16   q1, d6, #8 +    vshll.u16   q0, d4, #BILINEAR_INTERPOLATION_BITS +    vshll.u16   q1, d6, #BILINEAR_INTERPOLATION_BITS      vmlsl.u16   q0, d4, d30      vmlsl.u16   q1, d6, d31      vmlal.u16   q0, d5, d30      vmlal.u16   q1, d7, d31 -    vshrn.u32   d0, q0, #16 -    vshrn.u32   d1, q1, #16 +    vshrn.u32   d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) +    vshrn.u32   d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)      vld1.32     {d2}, [TMP3], STRIDE      vld1.32     {d3}, [TMP3]      pld         [TMP4, PF_OFFS] @@ -1099,7 +1100,7 @@ pixman_asm_function fname      vmlal.u8    q3, d3, d29      vmull.u8    q1, d4, d28      vmlal.u8    q1, d5, d29 -    vshr.u16    q15, q12, #8 +    vshr.u16    q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)      vld1.32     {d22[0]}, [MASK]!      pld         [MASK, #prefetch_offset]      vadd.u16    q12, q12, q13 @@ -1107,17 +1108,17 @@ pixman_asm_function fname  .endm  .macro bilinear_over_8888_8_8888_process_pixblock_tail -    vshll.u16   q9, d6, #8 -    vshll.u16   q10, d2, #8 +    vshll.u16   q9, d6, #BILINEAR_INTERPOLATION_BITS +    vshll.u16   q10, d2, #BILINEAR_INTERPOLATION_BITS      vmlsl.u16   q9, d6, d30      vmlsl.u16   q10, d2, d31      vmlal.u16   q9, d7, d30      vmlal.u16   q10, d3, d31 -    vshr.u16    q15, q12, #8 +    vshr.u16    q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)      vadd.u16    q12, q12, q13      vdup.32     d22, d22[0] -    vshrn.u32   d18, q9, #16 -    vshrn.u32   d19, q10, #16 +    vshrn.u32   d18, q9, #(2 * BILINEAR_INTERPOLATION_BITS) +    vshrn.u32   d19, q10, #(2 * BILINEAR_INTERPOLATION_BITS)      vmovn.u16   d17, q9      vld1.32     {d18, d19}, [OUT, :128]      pld         [OUT, PF_OFFS] @@ -1146,11 +1147,11 @@ pixman_asm_function fname  .endm  .macro bilinear_over_8888_8_8888_process_pixblock_tail_head -                                            vshll.u16   q9, d6, #8 +                                            vshll.u16   q9, d6, #BILINEAR_INTERPOLATION_BITS      mov         TMP1, X, asr #16      add         X, X, UX      add         TMP1, TOP, TMP1, asl #2 -                                            vshll.u16   q10, d2, #8 +                                            vshll.u16   q10, d2, #BILINEAR_INTERPOLATION_BITS      vld1.32     {d0}, [TMP1], STRIDE      mov         TMP2, X, asr #16      add         X, X, UX @@ -1167,12 +1168,12 @@ pixman_asm_function fname      mov         TMP4, X, asr #16      add         X, X, UX      add         TMP4, TOP, TMP4, asl #2 -                                            vshr.u16    q15, q12, #8 +                                            vshr.u16    q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)                                              vadd.u16    q12, q12, q13      vld1.32     {d3}, [TMP2]                                              vdup.32     d22, d22[0] -                                            vshrn.u32   d18, q9, #16 -                                            vshrn.u32   d19, q10, #16 +                                            vshrn.u32   d18, q9, #(2 * BILINEAR_INTERPOLATION_BITS) +                                            vshrn.u32   d19, q10, #(2 * BILINEAR_INTERPOLATION_BITS)      vmull.u8    q2, d0, d28      vmull.u8    q3, d2, d28                                              vmovn.u16   d17, q9 @@ -1182,8 +1183,8 @@ pixman_asm_function fname      vmlal.u8    q3, d3, d29                                              vuzp.8      d16, d17                                              vuzp.8      d18, d19 -    vshll.u16   q0, d4, #8 -    vshll.u16   q1, d6, #8 +    vshll.u16   q0, d4, #BILINEAR_INTERPOLATION_BITS +    vshll.u16   q1, d6, #BILINEAR_INTERPOLATION_BITS                                              vuzp.8      d16, d17                                              vuzp.8      d18, d19      vmlsl.u16   q0, d4, d30 @@ -1194,8 +1195,8 @@ pixman_asm_function fname      vmlal.u16   q1, d7, d31                                              vrsra.u16   q10, q10, #8                                              vrsra.u16   q11, q11, #8 -    vshrn.u32   d0, q0, #16 -    vshrn.u32   d1, q1, #16 +    vshrn.u32   d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) +    vshrn.u32   d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)                                              vrshrn.u16  d16, q10, #8                                              vrshrn.u16  d17, q11, #8      vld1.32     {d2}, [TMP3], STRIDE @@ -1216,7 +1217,7 @@ pixman_asm_function fname                                              vraddhn.u16 d18, q9, q10                                              vraddhn.u16 d19, q15, q11      vmlal.u8    q1, d5, d29 -    vshr.u16    q15, q12, #8 +    vshr.u16    q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)                                              vqadd.u8    q9, q8, q9      vld1.32     {d22[0]}, [MASK]!                                              vuzp.8      d18, d19 diff --git a/pixman/pixman/pixman-arm-neon-asm.S b/pixman/pixman/pixman-arm-neon-asm.S index 87aae1d55..187197dc3 100644 --- a/pixman/pixman/pixman-arm-neon-asm.S +++ b/pixman/pixman/pixman-arm-neon-asm.S @@ -49,6 +49,7 @@      .altmacro      .p2align 2 +#include "pixman-private.h"  #include "pixman-arm-neon-asm.h"  /* Global configuration options and preferences */ @@ -2986,11 +2987,11 @@ fname:      vmull.u8  q1, d0, d28      vmlal.u8  q1, d1, d29      /* 5 cycles bubble */ -    vshll.u16 q0, d2, #8 +    vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS      vmlsl.u16 q0, d2, d30      vmlal.u16 q0, d3, d30      /* 5 cycles bubble */ -    vshrn.u32 d0, q0, #16 +    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)      /* 3 cycles bubble */      vmovn.u16 d0, q0      /* 1 cycle bubble */ @@ -3000,15 +3001,15 @@ fname:  .macro bilinear_interpolate_two_pixels src_fmt, dst_fmt      bilinear_load_and_vertical_interpolate_two_&src_fmt \                  q1, q11, d0, d1, d20, d21, d22, d23 -    vshll.u16 q0, d2, #8 +    vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS      vmlsl.u16 q0, d2, d30      vmlal.u16 q0, d3, d30 -    vshll.u16 q10, d22, #8 +    vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS      vmlsl.u16 q10, d22, d31      vmlal.u16 q10, d23, d31 -    vshrn.u32 d0, q0, #16 -    vshrn.u32 d1, q10, #16 -    vshr.u16  q15, q12, #8 +    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) +    vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS) +    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)      vadd.u16  q12, q12, q13      vmovn.u16 d0, q0      bilinear_store_&dst_fmt 2, q2, q3 @@ -3020,26 +3021,26 @@ fname:                  q3, q9,  d4, d5, d16, d17, d18, d19      pld       [TMP1, PF_OFFS]      sub       TMP1, TMP1, STRIDE -    vshll.u16 q0, d2, #8 +    vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS      vmlsl.u16 q0, d2, d30      vmlal.u16 q0, d3, d30 -    vshll.u16 q10, d22, #8 +    vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS      vmlsl.u16 q10, d22, d31      vmlal.u16 q10, d23, d31 -    vshr.u16  q15, q12, #8 -    vshll.u16 q2, d6, #8 +    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) +    vshll.u16 q2, d6, #BILINEAR_INTERPOLATION_BITS      vmlsl.u16 q2, d6, d30      vmlal.u16 q2, d7, d30 -    vshll.u16 q8, d18, #8 +    vshll.u16 q8, d18, #BILINEAR_INTERPOLATION_BITS      pld       [TMP2, PF_OFFS]      vmlsl.u16 q8, d18, d31      vmlal.u16 q8, d19, d31      vadd.u16  q12, q12, q13 -    vshrn.u32 d0, q0, #16 -    vshrn.u32 d1, q10, #16 -    vshrn.u32 d4, q2, #16 -    vshrn.u32 d5, q8, #16 -    vshr.u16  q15, q12, #8 +    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) +    vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS) +    vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS) +    vshrn.u32 d5, q8, #(2 * BILINEAR_INTERPOLATION_BITS) +    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)      vmovn.u16 d0, q0      vmovn.u16 d1, q2      vadd.u16  q12, q12, q13 @@ -3158,13 +3159,13 @@ pixman_asm_function fname      blt       0f      tst       OUT, #(1 << dst_bpp_shift)      beq       0f -    vshr.u16  q15, q12, #8 +    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)      vadd.u16  q12, q12, q13      bilinear_interpolate_last_pixel src_fmt, dst_fmt      sub       WIDTH, WIDTH, #1  0:      vadd.u16  q13, q13, q13 -    vshr.u16  q15, q12, #8 +    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)      vadd.u16  q12, q12, q13      cmp       WIDTH, #2 @@ -3282,7 +3283,7 @@ pixman_asm_function fname      vmull.u8  q10, d22, d28      vmlal.u8  q10, d23, d29 -    vshll.u16 q0, d16, #8 +    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS      vmlsl.u16 q0, d16, d30      vmlal.u16 q0, d17, d30 @@ -3293,25 +3294,25 @@ pixman_asm_function fname      vmull.u8  q11, d16, d28      vmlal.u8  q11, d17, d29 -    vshll.u16 q1, d18, #8 +    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS      vmlsl.u16 q1, d18, d31  .endm  .macro bilinear_interpolate_four_pixels_8888_8888_tail      vmlal.u16 q1, d19, d31 -    vshr.u16  q15, q12, #8 -    vshll.u16 q2, d20, #8 +    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) +    vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS      vmlsl.u16 q2, d20, d30      vmlal.u16 q2, d21, d30 -    vshll.u16 q3, d22, #8 +    vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS      vmlsl.u16 q3, d22, d31      vmlal.u16 q3, d23, d31      vadd.u16  q12, q12, q13 -    vshrn.u32 d0, q0, #16 -    vshrn.u32 d1, q1, #16 -    vshrn.u32 d4, q2, #16 -    vshr.u16  q15, q12, #8 -    vshrn.u32 d5, q3, #16 +    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) +    vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS) +    vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS) +    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) +    vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)      vmovn.u16 d6, q0      vmovn.u16 d7, q2      vadd.u16  q12, q12, q13 @@ -3326,22 +3327,22 @@ pixman_asm_function fname      add       X, X, UX      add       TMP2, TOP, TMP2, asl #2          vmlal.u16 q1, d19, d31 -        vshr.u16  q15, q12, #8 -        vshll.u16 q2, d20, #8 +        vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) +        vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS          vmlsl.u16 q2, d20, d30          vmlal.u16 q2, d21, d30 -        vshll.u16 q3, d22, #8 +        vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS      vld1.32   {d20}, [TMP1], STRIDE          vmlsl.u16 q3, d22, d31          vmlal.u16 q3, d23, d31      vld1.32   {d21}, [TMP1]      vmull.u8  q8, d20, d28      vmlal.u8  q8, d21, d29 -        vshrn.u32 d0, q0, #16 -        vshrn.u32 d1, q1, #16 -        vshrn.u32 d4, q2, #16 +        vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) +        vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS) +        vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)      vld1.32   {d22}, [TMP2], STRIDE -        vshrn.u32 d5, q3, #16 +        vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)          vadd.u16  q12, q12, q13      vld1.32   {d23}, [TMP2]      vmull.u8  q9, d22, d28 @@ -3353,12 +3354,12 @@ pixman_asm_function fname      add       TMP4, TOP, TMP4, asl #2      vmlal.u8  q9, d23, d29      vld1.32   {d22}, [TMP3], STRIDE -        vshr.u16  q15, q12, #8 +        vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)      vld1.32   {d23}, [TMP3]      vmull.u8  q10, d22, d28      vmlal.u8  q10, d23, d29          vmovn.u16 d6, q0 -    vshll.u16 q0, d16, #8 +    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS          vmovn.u16 d7, q2      vmlsl.u16 q0, d16, d30      vmlal.u16 q0, d17, d30 @@ -3370,7 +3371,7 @@ pixman_asm_function fname      vmull.u8  q11, d16, d28      vmlal.u8  q11, d17, d29          vst1.32   {d6, d7}, [OUT, :128]! -    vshll.u16 q1, d18, #8 +    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS      vmlsl.u16 q1, d18, d31  .endm @@ -3403,7 +3404,7 @@ pixman_asm_function fname      vld1.32   {d23}, [TMP3]      vmull.u8  q10, d22, d28      vmlal.u8  q10, d23, d29 -    vshll.u16 q0, d16, #8 +    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS      vmlsl.u16 q0, d16, d30      vmlal.u16 q0, d17, d30      pld       [TMP4, PF_OFFS] @@ -3412,7 +3413,7 @@ pixman_asm_function fname      pld       [TMP4, PF_OFFS]      vmull.u8  q11, d16, d28      vmlal.u8  q11, d17, d29 -    vshll.u16 q1, d18, #8 +    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS      vmlsl.u16 q1, d18, d31      mov       TMP1, X, asr #16 @@ -3422,22 +3423,22 @@ pixman_asm_function fname      add       X, X, UX      add       TMP2, TOP, TMP2, asl #2          vmlal.u16 q1, d19, d31 -        vshr.u16  q15, q12, #8 -        vshll.u16 q2, d20, #8 +        vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) +        vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS          vmlsl.u16 q2, d20, d30          vmlal.u16 q2, d21, d30 -        vshll.u16 q3, d22, #8 +        vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS      vld1.32   {d20}, [TMP1], STRIDE          vmlsl.u16 q3, d22, d31          vmlal.u16 q3, d23, d31      vld1.32   {d21}, [TMP1]      vmull.u8  q8, d20, d28      vmlal.u8  q8, d21, d29 -        vshrn.u32 d0, q0, #16 -        vshrn.u32 d1, q1, #16 -        vshrn.u32 d4, q2, #16 +        vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) +        vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS) +        vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)      vld1.32   {d22}, [TMP2], STRIDE -        vshrn.u32 d5, q3, #16 +        vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)          vadd.u16  q12, q12, q13      vld1.32   {d23}, [TMP2]      vmull.u8  q9, d22, d28 @@ -3449,12 +3450,12 @@ pixman_asm_function fname      add       TMP4, TOP, TMP4, asl #2      vmlal.u8  q9, d23, d29      vld1.32   {d22}, [TMP3], STRIDE -        vshr.u16  q15, q12, #8 +        vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)      vld1.32   {d23}, [TMP3]      vmull.u8  q10, d22, d28      vmlal.u8  q10, d23, d29          vmovn.u16 d8, q0 -    vshll.u16 q0, d16, #8 +    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS          vmovn.u16 d9, q2      vmlsl.u16 q0, d16, d30      vmlal.u16 q0, d17, d30 @@ -3465,25 +3466,25 @@ pixman_asm_function fname      pld       [TMP4, PF_OFFS]      vmull.u8  q11, d16, d28      vmlal.u8  q11, d17, d29 -    vshll.u16 q1, d18, #8 +    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS      vmlsl.u16 q1, d18, d31  .endm  .macro bilinear_interpolate_eight_pixels_8888_0565_tail      vmlal.u16 q1, d19, d31 -    vshr.u16  q15, q12, #8 -    vshll.u16 q2, d20, #8 +    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) +    vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS      vmlsl.u16 q2, d20, d30      vmlal.u16 q2, d21, d30 -    vshll.u16 q3, d22, #8 +    vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS      vmlsl.u16 q3, d22, d31      vmlal.u16 q3, d23, d31      vadd.u16  q12, q12, q13 -    vshrn.u32 d0, q0, #16 -    vshrn.u32 d1, q1, #16 -    vshrn.u32 d4, q2, #16 -    vshr.u16  q15, q12, #8 -    vshrn.u32 d5, q3, #16 +    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) +    vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS) +    vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS) +    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) +    vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)      vmovn.u16 d10, q0      vmovn.u16 d11, q2      vadd.u16  q12, q12, q13 @@ -3508,23 +3509,23 @@ pixman_asm_function fname      add       X, X, UX      add       TMP2, TOP, TMP2, asl #2          vmlal.u16 q1, d19, d31 -        vshr.u16  q15, q12, #8 +        vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)              vuzp.u8 d8, d9 -        vshll.u16 q2, d20, #8 +        vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS          vmlsl.u16 q2, d20, d30          vmlal.u16 q2, d21, d30 -        vshll.u16 q3, d22, #8 +        vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS      vld1.32   {d20}, [TMP1], STRIDE          vmlsl.u16 q3, d22, d31          vmlal.u16 q3, d23, d31      vld1.32   {d21}, [TMP1]      vmull.u8  q8, d20, d28      vmlal.u8  q8, d21, d29 -        vshrn.u32 d0, q0, #16 -        vshrn.u32 d1, q1, #16 -        vshrn.u32 d4, q2, #16 +        vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) +        vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS) +        vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)      vld1.32   {d22}, [TMP2], STRIDE -        vshrn.u32 d5, q3, #16 +        vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)          vadd.u16  q12, q12, q13      vld1.32   {d23}, [TMP2]      vmull.u8  q9, d22, d28 @@ -3536,12 +3537,12 @@ pixman_asm_function fname      add       TMP4, TOP, TMP4, asl #2      vmlal.u8  q9, d23, d29      vld1.32   {d22}, [TMP3], STRIDE -        vshr.u16  q15, q12, #8 +        vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)      vld1.32   {d23}, [TMP3]      vmull.u8  q10, d22, d28      vmlal.u8  q10, d23, d29          vmovn.u16 d10, q0 -    vshll.u16 q0, d16, #8 +    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS          vmovn.u16 d11, q2      vmlsl.u16 q0, d16, d30      vmlal.u16 q0, d17, d30 @@ -3553,7 +3554,7 @@ pixman_asm_function fname      vmull.u8  q11, d16, d28      vmlal.u8  q11, d17, d29              vuzp.u8 d10, d11 -    vshll.u16 q1, d18, #8 +    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS      vmlsl.u16 q1, d18, d31      mov       TMP1, X, asr #16 @@ -3564,12 +3565,12 @@ pixman_asm_function fname      add       TMP2, TOP, TMP2, asl #2          vmlal.u16 q1, d19, d31              vuzp.u8 d9, d11 -        vshr.u16  q15, q12, #8 -        vshll.u16 q2, d20, #8 +        vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) +        vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS              vuzp.u8 d8, d10          vmlsl.u16 q2, d20, d30          vmlal.u16 q2, d21, d30 -        vshll.u16 q3, d22, #8 +        vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS      vld1.32   {d20}, [TMP1], STRIDE          vmlsl.u16 q3, d22, d31          vmlal.u16 q3, d23, d31 @@ -3579,13 +3580,13 @@ pixman_asm_function fname              vshll.u8  q6, d9, #8              vshll.u8  q5, d10, #8              vshll.u8  q7, d8, #8 -        vshrn.u32 d0, q0, #16 +        vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)              vsri.u16  q5, q6, #5 -        vshrn.u32 d1, q1, #16 +        vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)              vsri.u16  q5, q7, #11 -        vshrn.u32 d4, q2, #16 +        vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)      vld1.32   {d22}, [TMP2], STRIDE -        vshrn.u32 d5, q3, #16 +        vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)          vadd.u16  q12, q12, q13      vld1.32   {d23}, [TMP2]      vmull.u8  q9, d22, d28 @@ -3597,12 +3598,12 @@ pixman_asm_function fname      add       TMP4, TOP, TMP4, asl #2      vmlal.u8  q9, d23, d29      vld1.32   {d22}, [TMP3], STRIDE -        vshr.u16  q15, q12, #8 +        vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)      vld1.32   {d23}, [TMP3]      vmull.u8  q10, d22, d28      vmlal.u8  q10, d23, d29          vmovn.u16 d8, q0 -    vshll.u16 q0, d16, #8 +    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS          vmovn.u16 d9, q2      vmlsl.u16 q0, d16, d30      vmlal.u16 q0, d17, d30 @@ -3613,7 +3614,7 @@ pixman_asm_function fname      pld       [TMP4, PF_OFFS]      vmull.u8  q11, d16, d28      vmlal.u8  q11, d17, d29 -    vshll.u16 q1, d18, #8 +    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS              vst1.32   {d10, d11}, [OUT, :128]!      vmlsl.u16 q1, d18, d31  .endm diff --git a/pixman/pixman/pixman-bits-image.c b/pixman/pixman/pixman-bits-image.c index 05eab9634..b6c8630f4 100644 --- a/pixman/pixman/pixman-bits-image.c +++ b/pixman/pixman/pixman-bits-image.c @@ -131,8 +131,8 @@ bits_image_fetch_pixel_bilinear (bits_image_t   *image,      x1 = x - pixman_fixed_1 / 2;      y1 = y - pixman_fixed_1 / 2; -    distx = (x1 >> 8) & 0xff; -    disty = (y1 >> 8) & 0xff; +    distx = pixman_fixed_to_bilinear_weight (x1); +    disty = pixman_fixed_to_bilinear_weight (y1);      x1 = pixman_fixed_to_int (x1);      y1 = pixman_fixed_to_int (y1); @@ -200,7 +200,7 @@ bits_image_fetch_bilinear_no_repeat_8888 (pixman_iter_t *iter,      x = x_top = x_bottom = v.vector[0] - pixman_fixed_1/2;      y = v.vector[1] - pixman_fixed_1/2; -    disty = (y >> 8) & 0xff; +    disty = pixman_fixed_to_bilinear_weight (y);      /* Load the pointers to the first and second lines from the source       * image that bilinear code must read. @@ -309,7 +309,7 @@ bits_image_fetch_bilinear_no_repeat_8888 (pixman_iter_t *iter,  	tr = top_row[pixman_fixed_to_int (x_top) + 1] | top_mask;  	br = bottom_row[pixman_fixed_to_int (x_bottom) + 1] | bottom_mask; -	distx = (x >> 8) & 0xff; +	distx = pixman_fixed_to_bilinear_weight (x);  	*buffer++ = bilinear_interpolation (0, tr, 0, br, distx, disty); @@ -334,7 +334,7 @@ bits_image_fetch_bilinear_no_repeat_8888 (pixman_iter_t *iter,  	    bl = bottom_row [pixman_fixed_to_int (x_bottom)] | bottom_mask;  	    br = bottom_row [pixman_fixed_to_int (x_bottom) + 1] | bottom_mask; -	    distx = (x >> 8) & 0xff; +	    distx = pixman_fixed_to_bilinear_weight (x);  	    *buffer = bilinear_interpolation (tl, tr, bl, br, distx, disty);  	} @@ -358,7 +358,7 @@ bits_image_fetch_bilinear_no_repeat_8888 (pixman_iter_t *iter,  	    tl = top_row [pixman_fixed_to_int (x_top)] | top_mask;  	    bl = bottom_row [pixman_fixed_to_int (x_bottom)] | bottom_mask; -	    distx = (x >> 8) & 0xff; +	    distx = pixman_fixed_to_bilinear_weight (x);  	    *buffer = bilinear_interpolation (tl, 0, bl, 0, distx, disty);  	} @@ -695,8 +695,8 @@ bits_image_fetch_bilinear_affine (pixman_image_t * image,  	x1 = x - pixman_fixed_1 / 2;  	y1 = y - pixman_fixed_1 / 2; -	distx = (x1 >> 8) & 0xff; -	disty = (y1 >> 8) & 0xff; +	distx = pixman_fixed_to_bilinear_weight (x1); +	disty = pixman_fixed_to_bilinear_weight (y1);  	y1 = pixman_fixed_to_int (y1);  	y2 = y1 + 1; diff --git a/pixman/pixman/pixman-inlines.h b/pixman/pixman/pixman-inlines.h index 3532867a4..5517de5a5 100644 --- a/pixman/pixman/pixman-inlines.h +++ b/pixman/pixman/pixman-inlines.h @@ -81,6 +81,13 @@ repeat (pixman_repeat_t repeat, int *c, int size)      return TRUE;  } +static force_inline int +pixman_fixed_to_bilinear_weight (pixman_fixed_t x) +{ +    return (x >> (16 - BILINEAR_INTERPOLATION_BITS)) & +	   ((1 << BILINEAR_INTERPOLATION_BITS) - 1); +} +  #if SIZEOF_LONG > 4  static force_inline uint32_t @@ -92,6 +99,9 @@ bilinear_interpolation (uint32_t tl, uint32_t tr,      uint64_t tl64, tr64, bl64, br64;      uint64_t f, r; +    distx <<= (8 - BILINEAR_INTERPOLATION_BITS); +    disty <<= (8 - BILINEAR_INTERPOLATION_BITS); +      distxy = distx * disty;      distxiy = distx * (256 - disty);      distixy = (256 - distx) * disty; @@ -135,6 +145,9 @@ bilinear_interpolation (uint32_t tl, uint32_t tr,      int distxy, distxiy, distixy, distixiy;      uint32_t f, r; +    distx <<= (8 - BILINEAR_INTERPOLATION_BITS); +    disty <<= (8 - BILINEAR_INTERPOLATION_BITS); +      distxy = distx * disty;      distxiy = (distx << 8) - distxy;	/* distx * (256 - disty) */      distixy = (disty << 8) - distxy;	/* disty * (256 - distx) */ @@ -758,12 +771,14 @@ bilinear_pad_repeat_get_scanline_bounds (int32_t         source_image_width,   *                        all source pixels are fetched from zero padding   *                        zone for NONE repeat   * - * Note: normally the sum of 'weight_top' and 'weight_bottom' is equal to 256, - *       but sometimes it may be less than that for NONE repeat when handling - *       fuzzy antialiased top or bottom image edges. Also both top and - *       bottom weight variables are guaranteed to have value in 0-255 - *       range and can fit into unsigned byte or be used with 8-bit SIMD - *       multiplication instructions. + * Note: normally the sum of 'weight_top' and 'weight_bottom' is equal to + *       BILINEAR_INTERPOLATION_RANGE, but sometimes it may be less than that + *       for NONE repeat when handling fuzzy antialiased top or bottom image + *       edges. Also both top and bottom weight variables are guaranteed to + *       have value, which is less than BILINEAR_INTERPOLATION_RANGE. + *       For example, the weights can fit into unsigned byte or be used + *       with 8-bit SIMD multiplication instructions for 8-bit interpolation + *       precision.   */  #define FAST_BILINEAR_MAINLOOP_INT(scale_func_name, scanline_func, src_type_t, mask_type_t,	\  				  dst_type_t, repeat_mode, flags)				\ @@ -877,18 +892,18 @@ fast_composite_scaled_bilinear ## scale_func_name (pixman_implementation_t *imp,  	}											\  												\  	y1 = pixman_fixed_to_int (vy);								\ -	weight2 = (vy >> 8) & 0xff;								\ +	weight2 = pixman_fixed_to_bilinear_weight (vy);						\  	if (weight2)										\  	{											\ -	    /* normal case, both row weights are in 0-255 range and fit unsigned byte */	\ +	    /* both weight1 and weight2 are smaller than BILINEAR_INTERPOLATION_RANGE */	\  	    y2 = y1 + 1;									\ -	    weight1 = 256 - weight2;								\ +	    weight1 = BILINEAR_INTERPOLATION_RANGE - weight2;					\  	}											\  	else											\  	{											\ -	    /* set both top and bottom row to the same scanline, and weights to 128+128 */	\ +	    /* set both top and bottom row to the same scanline and tweak weights */		\  	    y2 = y1;										\ -	    weight1 = weight2 = 128;								\ +	    weight1 = weight2 = BILINEAR_INTERPOLATION_RANGE / 2;				\  	}											\  	vy += unit_y;										\  	if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD)					\ diff --git a/pixman/pixman/pixman-mips-dspr2-asm.S b/pixman/pixman/pixman-mips-dspr2-asm.S index 87558f032..48f108ed9 100644 --- a/pixman/pixman/pixman-mips-dspr2-asm.S +++ b/pixman/pixman/pixman-mips-dspr2-asm.S @@ -29,6 +29,7 @@   * Author:  Nemanja Lukic (nlukic@mips.com)   */ +#include "pixman-private.h"  #include "pixman-mips-dspr2-asm.h"  LEAF_MIPS_DSPR2(pixman_fill_buff16_mips) @@ -771,11 +772,15 @@ LEAF_MIPS_DSPR2(pixman_scaled_bilinear_scanline_8888_8_8888_OVER_asm_mips)      lw       s1, 48(sp)        /* s1 = wb */      lw       s2, 52(sp)        /* s2 = vx */      lw       s3, 56(sp)        /* s3 = unit_x */ -    li       v0, 256 +    li       v0, BILINEAR_INTERPOLATION_RANGE      li       s8, 0x00ff00ff + +    sll      s0, s0, (2 * (8 - BILINEAR_INTERPOLATION_BITS)) +    sll      s1, s1, (2 * (8 - BILINEAR_INTERPOLATION_BITS)) +  0:      andi     t4, s2, 0xffff    /* t4 = (short)vx */ -    srl      t4, t4, 8         /* t4 = vx >> 8 */ +    srl      t4, t4, (16 - BILINEAR_INTERPOLATION_BITS) /* t4 = vx >> 8 */      subu     t5, v0, t4        /* t5 = ( 256 - (vx>>8)) */      mul      s4, s0, t5        /* s4 = wt*(256-(vx>>8)) */ diff --git a/pixman/pixman/pixman-mmx.c b/pixman/pixman/pixman-mmx.c index 7c1f4fe24..b3ef2495b 100644 --- a/pixman/pixman/pixman-mmx.c +++ b/pixman/pixman/pixman-mmx.c @@ -42,6 +42,7 @@  #endif  #include "pixman-private.h"  #include "pixman-combine32.h" +#include "pixman-inlines.h"  #define no_vERBOSE @@ -718,6 +719,24 @@ combine (const uint32_t *src, const uint32_t *mask)      return vsrc;  } +static force_inline __m64 +core_combine_over_u_pixel_mmx (__m64 vsrc, __m64 vdst) +{ +    vsrc = _mm_unpacklo_pi8 (vsrc, _mm_setzero_si64 ()); + +    if (is_opaque (vsrc)) +    { +	return vsrc; +    } +    else if (!is_zero (vsrc)) +    { +	return over (vsrc, expand_alpha (vsrc), +		     _mm_unpacklo_pi8 (vdst, _mm_setzero_si64 ())); +    } + +    return _mm_unpacklo_pi8 (vdst, _mm_setzero_si64 ()); +} +  static void  mmx_combine_over_u (pixman_implementation_t *imp,                      pixman_op_t              op, @@ -1623,9 +1642,7 @@ mmx_composite_over_8888_n_8888 (pixman_implementation_t *imp,      PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);      mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format); -    mask &= 0xff000000; -    mask = mask | mask >> 8 | mask >> 16 | mask >> 24; -    vmask = load8888 (&mask); +    vmask = expand_alpha (load8888 (&mask));      while (height--)      { @@ -1694,9 +1711,7 @@ mmx_composite_over_x888_n_8888 (pixman_implementation_t *imp,      PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);      mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format); -    mask &= 0xff000000; -    mask = mask | mask >> 8 | mask >> 16 | mask >> 24; -    vmask = load8888 (&mask); +    vmask = expand_alpha (load8888 (&mask));      srca = MC (4x00ff);      while (height--) @@ -3532,6 +3547,242 @@ mmx_composite_over_reverse_n_8888 (pixman_implementation_t *imp,      _mm_empty ();  } +#define BSHIFT ((1 << BILINEAR_INTERPOLATION_BITS)) +#define BMSK (BSHIFT - 1) + +#define BILINEAR_DECLARE_VARIABLES						\ +    const __m64 mm_wt = _mm_set_pi16 (wt, wt, wt, wt);				\ +    const __m64 mm_wb = _mm_set_pi16 (wb, wb, wb, wb);				\ +    const __m64 mm_BSHIFT = _mm_set_pi16 (BSHIFT, BSHIFT, BSHIFT, BSHIFT);	\ +    const __m64 mm_addc7 = _mm_set_pi16 (0, 1, 0, 1);				\ +    const __m64 mm_xorc7 = _mm_set_pi16 (0, BMSK, 0, BMSK);			\ +    const __m64 mm_ux = _mm_set_pi16 (unit_x, unit_x, unit_x, unit_x);		\ +    const __m64 mm_zero = _mm_setzero_si64 ();					\ +    __m64 mm_x = _mm_set_pi16 (vx, vx, vx, vx) + +#define BILINEAR_INTERPOLATE_ONE_PIXEL(pix)					\ +do {										\ +    /* fetch 2x2 pixel block into 2 mmx registers */				\ +    __m64 t = ldq_u ((__m64 *)&src_top [pixman_fixed_to_int (vx)]);		\ +    __m64 b = ldq_u ((__m64 *)&src_bottom [pixman_fixed_to_int (vx)]);		\ +    vx += unit_x;								\ +    /* vertical interpolation */						\ +    __m64 t_hi = _mm_mullo_pi16 (_mm_unpackhi_pi8 (t, mm_zero), mm_wt);		\ +    __m64 t_lo = _mm_mullo_pi16 (_mm_unpacklo_pi8 (t, mm_zero), mm_wt);		\ +    __m64 b_hi = _mm_mullo_pi16 (_mm_unpackhi_pi8 (b, mm_zero), mm_wb);		\ +    __m64 b_lo = _mm_mullo_pi16 (_mm_unpacklo_pi8 (b, mm_zero), mm_wb);		\ +    __m64 hi = _mm_add_pi16 (t_hi, b_hi);					\ +    __m64 lo = _mm_add_pi16 (t_lo, b_lo);					\ +    if (BILINEAR_INTERPOLATION_BITS < 8)					\ +    {										\ +	/* calculate horizontal weights */					\ +	__m64 mm_wh = _mm_add_pi16 (mm_addc7, _mm_xor_si64 (mm_xorc7,		\ +			  _mm_srli_pi16 (mm_x,					\ +					 16 - BILINEAR_INTERPOLATION_BITS)));	\ +	mm_x = _mm_add_pi16 (mm_x, mm_ux);					\ +	/* horizontal interpolation */						\ +	__m64 p = _mm_unpacklo_pi16 (lo, hi);					\ +	__m64 q = _mm_unpackhi_pi16 (lo, hi);					\ +	lo = _mm_madd_pi16 (p, mm_wh);						\ +	hi = _mm_madd_pi16 (q, mm_wh);						\ +    }										\ +    else									\ +    {										\ +	/* calculate horizontal weights */					\ +	__m64 mm_wh_lo = _mm_sub_pi16 (mm_BSHIFT, _mm_srli_pi16 (mm_x,		\ +					16 - BILINEAR_INTERPOLATION_BITS));	\ +	__m64 mm_wh_hi = _mm_srli_pi16 (mm_x,					\ +					16 - BILINEAR_INTERPOLATION_BITS);	\ +	mm_x = _mm_add_pi16 (mm_x, mm_ux);					\ +	/* horizontal interpolation */						\ +	__m64 mm_lo_lo = _mm_mullo_pi16 (lo, mm_wh_lo);				\ +	__m64 mm_lo_hi = _mm_mullo_pi16 (hi, mm_wh_hi);				\ +	__m64 mm_hi_lo = _mm_mulhi_pu16 (lo, mm_wh_lo);				\ +	__m64 mm_hi_hi = _mm_mulhi_pu16 (hi, mm_wh_hi);				\ +	lo = _mm_add_pi32 (_mm_unpacklo_pi16 (mm_lo_lo, mm_hi_lo),		\ +			   _mm_unpacklo_pi16 (mm_lo_hi, mm_hi_hi));		\ +	hi = _mm_add_pi32 (_mm_unpackhi_pi16 (mm_lo_lo, mm_hi_lo),		\ +			   _mm_unpackhi_pi16 (mm_lo_hi, mm_hi_hi));		\ +    }										\ +    /* shift and pack the result */						\ +    hi = _mm_srli_pi32 (hi, BILINEAR_INTERPOLATION_BITS * 2);			\ +    lo = _mm_srli_pi32 (lo, BILINEAR_INTERPOLATION_BITS * 2);			\ +    lo = _mm_packs_pi32 (lo, hi);						\ +    lo = _mm_packs_pu16 (lo, lo);						\ +    pix = lo;									\ +} while (0) + +#define BILINEAR_SKIP_ONE_PIXEL()						\ +do {										\ +    vx += unit_x;								\ +    mm_x = _mm_add_pi16 (mm_x, mm_ux);						\ +} while(0) + +static force_inline void +scaled_bilinear_scanline_mmx_8888_8888_SRC (uint32_t *       dst, +					    const uint32_t * mask, +					    const uint32_t * src_top, +					    const uint32_t * src_bottom, +					    int32_t          w, +					    int              wt, +					    int              wb, +					    pixman_fixed_t   vx, +					    pixman_fixed_t   unit_x, +					    pixman_fixed_t   max_vx, +					    pixman_bool_t    zero_src) +{ +    BILINEAR_DECLARE_VARIABLES; +    __m64 pix; + +    while (w--) +    { +	BILINEAR_INTERPOLATE_ONE_PIXEL (pix); +	store (dst, pix); +	dst++; +    } + +    _mm_empty (); +} + +FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_cover_SRC, +			       scaled_bilinear_scanline_mmx_8888_8888_SRC, +			       uint32_t, uint32_t, uint32_t, +			       COVER, FLAG_NONE) +FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_pad_SRC, +			       scaled_bilinear_scanline_mmx_8888_8888_SRC, +			       uint32_t, uint32_t, uint32_t, +			       PAD, FLAG_NONE) +FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_none_SRC, +			       scaled_bilinear_scanline_mmx_8888_8888_SRC, +			       uint32_t, uint32_t, uint32_t, +			       NONE, FLAG_NONE) +FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_normal_SRC, +			       scaled_bilinear_scanline_mmx_8888_8888_SRC, +			       uint32_t, uint32_t, uint32_t, +			       NORMAL, FLAG_NONE) + +static force_inline void +scaled_bilinear_scanline_mmx_8888_8888_OVER (uint32_t *       dst, +					     const uint32_t * mask, +					     const uint32_t * src_top, +					     const uint32_t * src_bottom, +					     int32_t          w, +					     int              wt, +					     int              wb, +					     pixman_fixed_t   vx, +					     pixman_fixed_t   unit_x, +					     pixman_fixed_t   max_vx, +					     pixman_bool_t    zero_src) +{ +    BILINEAR_DECLARE_VARIABLES; +    __m64 pix1, pix2; + +    while (w) +    { +	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); + +	if (!is_zero (pix1)) +	{ +	    pix2 = load (dst); +	    store8888 (dst, core_combine_over_u_pixel_mmx (pix1, pix2)); +	} + +	w--; +	dst++; +    } + +    _mm_empty (); +} + +FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_cover_OVER, +			       scaled_bilinear_scanline_mmx_8888_8888_OVER, +			       uint32_t, uint32_t, uint32_t, +			       COVER, FLAG_NONE) +FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_pad_OVER, +			       scaled_bilinear_scanline_mmx_8888_8888_OVER, +			       uint32_t, uint32_t, uint32_t, +			       PAD, FLAG_NONE) +FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_none_OVER, +			       scaled_bilinear_scanline_mmx_8888_8888_OVER, +			       uint32_t, uint32_t, uint32_t, +			       NONE, FLAG_NONE) +FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_normal_OVER, +			       scaled_bilinear_scanline_mmx_8888_8888_OVER, +			       uint32_t, uint32_t, uint32_t, +			       NORMAL, FLAG_NONE) + +static force_inline void +scaled_bilinear_scanline_mmx_8888_8_8888_OVER (uint32_t *       dst, +					       const uint8_t  * mask, +					       const uint32_t * src_top, +					       const uint32_t * src_bottom, +					       int32_t          w, +					       int              wt, +					       int              wb, +					       pixman_fixed_t   vx, +					       pixman_fixed_t   unit_x, +					       pixman_fixed_t   max_vx, +					       pixman_bool_t    zero_src) +{ +    BILINEAR_DECLARE_VARIABLES; +    __m64 pix1, pix2; +    uint32_t m; + +    while (w) +    { +	m = (uint32_t) *mask++; + +	if (m) +	{ +	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); + +	    if (m == 0xff && is_opaque (pix1)) +	    { +		store (dst, pix1); +	    } +	    else +	    { +		__m64 ms, md, ma, msa; + +		pix2 = load (dst); +		ma = expand_alpha_rev (to_m64 (m)); +		ms = _mm_unpacklo_pi8 (pix1, _mm_setzero_si64 ()); +		md = _mm_unpacklo_pi8 (pix2, _mm_setzero_si64 ()); + +		msa = expand_alpha (ms); + +		store8888 (dst, (in_over (ms, msa, ma, md))); +	    } +	} +	else +	{ +	    BILINEAR_SKIP_ONE_PIXEL (); +	} + +	w--; +	dst++; +    } + +    _mm_empty (); +} + +FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_cover_OVER, +			       scaled_bilinear_scanline_mmx_8888_8_8888_OVER, +			       uint32_t, uint8_t, uint32_t, +			       COVER, FLAG_HAVE_NON_SOLID_MASK) +FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_pad_OVER, +			       scaled_bilinear_scanline_mmx_8888_8_8888_OVER, +			       uint32_t, uint8_t, uint32_t, +			       PAD, FLAG_HAVE_NON_SOLID_MASK) +FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_none_OVER, +			       scaled_bilinear_scanline_mmx_8888_8_8888_OVER, +			       uint32_t, uint8_t, uint32_t, +			       NONE, FLAG_HAVE_NON_SOLID_MASK) +FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_normal_OVER, +			       scaled_bilinear_scanline_mmx_8888_8_8888_OVER, +			       uint32_t, uint8_t, uint32_t, +			       NORMAL, FLAG_HAVE_NON_SOLID_MASK) +  static uint32_t *  mmx_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)  { @@ -3787,6 +4038,23 @@ static const pixman_fast_path_t mmx_fast_paths[] =      PIXMAN_STD_FAST_PATH    (IN,   a8,       null,     a8,       mmx_composite_in_8_8              ),      PIXMAN_STD_FAST_PATH    (IN,   solid,    a8,       a8,       mmx_composite_in_n_8_8            ), +    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8,          a8r8g8b8, mmx_8888_8888                     ), +    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8,          x8r8g8b8, mmx_8888_8888                     ), +    SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8,          x8r8g8b8, mmx_8888_8888                     ), +    SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8,          a8b8g8r8, mmx_8888_8888                     ), +    SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8,          x8b8g8r8, mmx_8888_8888                     ), +    SIMPLE_BILINEAR_FAST_PATH (SRC, x8b8g8r8,          x8b8g8r8, mmx_8888_8888                     ), + +    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8,         x8r8g8b8, mmx_8888_8888                     ), +    SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8,         x8b8g8r8, mmx_8888_8888                     ), +    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8,         a8r8g8b8, mmx_8888_8888                     ), +    SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8,         a8b8g8r8, mmx_8888_8888                     ), + +    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mmx_8888_8_8888                   ), +    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, mmx_8888_8_8888                   ), +    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mmx_8888_8_8888                   ), +    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, mmx_8888_8_8888                   ), +      { PIXMAN_OP_NONE },  }; diff --git a/pixman/pixman/pixman-private.h b/pixman/pixman/pixman-private.h index 72e3b4f6d..0c27798b0 100644 --- a/pixman/pixman/pixman-private.h +++ b/pixman/pixman/pixman-private.h @@ -1,10 +1,24 @@ +#ifndef PIXMAN_PRIVATE_H +#define PIXMAN_PRIVATE_H + +/* + * The defines which are shared between C and assembly code + */ + +/* bilinear interpolation precision (must be <= 8) */ +#define BILINEAR_INTERPOLATION_BITS 7 +#define BILINEAR_INTERPOLATION_RANGE (1 << BILINEAR_INTERPOLATION_BITS) + +/* + * C specific part + */ + +#ifndef __ASSEMBLER__ +  #ifndef PACKAGE  #  error config.h must be included before pixman-private.h  #endif -#ifndef PIXMAN_PRIVATE_H -#define PIXMAN_PRIVATE_H -  #define PIXMAN_DISABLE_DEPRECATED  #define PIXMAN_USE_INTERNAL_API @@ -1052,4 +1066,6 @@ void pixman_timer_register (pixman_timer_t *timer);  #endif /* PIXMAN_TIMERS */ +#endif /* __ASSEMBLER__ */ +  #endif /* PIXMAN_PRIVATE_H */ diff --git a/pixman/pixman/pixman-sse2.c b/pixman/pixman/pixman-sse2.c index b656d17d4..ba067bc31 100644 --- a/pixman/pixman/pixman-sse2.c +++ b/pixman/pixman/pixman-sse2.c @@ -5364,11 +5364,15 @@ FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,  			      scaled_nearest_scanline_sse2_8888_n_8888_OVER,  			      uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE) +#define BMSK ((1 << BILINEAR_INTERPOLATION_BITS) - 1) +  #define BILINEAR_DECLARE_VARIABLES						\      const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt);	\      const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb);	\ -    const __m128i xmm_xorc = _mm_set_epi16 (0, 0, 0, 0, 0xff, 0xff, 0xff, 0xff);\ -    const __m128i xmm_addc = _mm_set_epi16 (0, 0, 0, 0, 1, 1, 1, 1);		\ +    const __m128i xmm_xorc8 = _mm_set_epi16 (0, 0, 0, 0, BMSK, BMSK, BMSK, BMSK);\ +    const __m128i xmm_addc8 = _mm_set_epi16 (0, 0, 0, 0, 1, 1, 1, 1);		\ +    const __m128i xmm_xorc7 = _mm_set_epi16 (0, BMSK, 0, BMSK, 0, BMSK, 0, BMSK);\ +    const __m128i xmm_addc7 = _mm_set_epi16 (0, 1, 0, 1, 0, 1, 0, 1);		\      const __m128i xmm_ux = _mm_set_epi16 (unit_x&0xffff, unit_x&0xffff, unit_x&0xffff, unit_x&0xffff,	\  					  unit_x&0xffff, unit_x&0xffff, unit_x&0xffff, unit_x&0xffff);	\      const __m128i xmm_zero = _mm_setzero_si128 ();				\ @@ -5388,18 +5392,30 @@ do {										\  					xmm_wt),				\  		       _mm_mullo_epi16 (_mm_unpacklo_epi8 (blbr, xmm_zero),	\  					xmm_wb));				\ -    /* calculate horizontal weights */						\ -    xmm_wh = _mm_add_epi16 (xmm_addc,						\ -			    _mm_xor_si128 (xmm_xorc,				\ -					   _mm_srli_epi16 (xmm_x, 8)));		\ -    xmm_x = _mm_add_epi16 (xmm_x, xmm_ux);					\ -    /* horizontal interpolation */						\ -    xmm_lo = _mm_mullo_epi16 (a, xmm_wh);					\ -    xmm_hi = _mm_mulhi_epu16 (a, xmm_wh);					\ -    a = _mm_add_epi32 (_mm_unpacklo_epi16 (xmm_lo, xmm_hi),			\ -		       _mm_unpackhi_epi16 (xmm_lo, xmm_hi));			\ +    if (BILINEAR_INTERPOLATION_BITS < 8)					\ +    {										\ +	/* calculate horizontal weights */					\ +	xmm_wh = _mm_add_epi16 (xmm_addc7, _mm_xor_si128 (xmm_xorc7,		\ +		   _mm_srli_epi16 (xmm_x, 16 - BILINEAR_INTERPOLATION_BITS)));	\ +	xmm_x = _mm_add_epi16 (xmm_x, xmm_ux);					\ +	/* horizontal interpolation */						\ +	a = _mm_madd_epi16 (_mm_unpackhi_epi16 (_mm_shuffle_epi32 (		\ +		a, _MM_SHUFFLE (1, 0, 3, 2)), a), xmm_wh);			\ +    }										\ +    else									\ +    {										\ +	/* calculate horizontal weights */					\ +	xmm_wh = _mm_add_epi16 (xmm_addc8, _mm_xor_si128 (xmm_xorc8,		\ +		_mm_srli_epi16 (xmm_x, 16 - BILINEAR_INTERPOLATION_BITS)));	\ +	xmm_x = _mm_add_epi16 (xmm_x, xmm_ux);					\ +	/* horizontal interpolation */						\ +	xmm_lo = _mm_mullo_epi16 (a, xmm_wh);					\ +	xmm_hi = _mm_mulhi_epu16 (a, xmm_wh);					\ +	a = _mm_add_epi32 (_mm_unpacklo_epi16 (xmm_lo, xmm_hi),			\ +			   _mm_unpackhi_epi16 (xmm_lo, xmm_hi));		\ +    }										\      /* shift and pack the result */						\ -    a = _mm_srli_epi32 (a, 16);							\ +    a = _mm_srli_epi32 (a, BILINEAR_INTERPOLATION_BITS * 2);			\      a = _mm_packs_epi32 (a, a);							\      a = _mm_packus_epi16 (a, a);						\      pix = _mm_cvtsi128_si32 (a);						\ @@ -5845,6 +5861,9 @@ static const pixman_fast_path_t sse2_fast_paths[] =      SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, sse2_8888_8888),      SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, sse2_8888_8888),      SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, sse2_8888_8888), +    SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, sse2_8888_8888), +    SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, sse2_8888_8888), +    SIMPLE_BILINEAR_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, sse2_8888_8888),      SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),      SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888), diff --git a/pixman/test/affine-test.c b/pixman/test/affine-test.c index a4ceed3da..6827cc3a8 100644 --- a/pixman/test/affine-test.c +++ b/pixman/test/affine-test.c @@ -301,11 +301,21 @@ test_composite (int      testnum,      return crc32;  } +#if BILINEAR_INTERPOLATION_BITS == 8 +#define CHECKSUM 0x1EF2175A +#elif BILINEAR_INTERPOLATION_BITS == 7 +#define CHECKSUM 0x74050F50 +#elif BILINEAR_INTERPOLATION_BITS == 4 +#define CHECKSUM 0x4362EAE8 +#else +#define CHECKSUM 0x00000000 +#endif +  int  main (int argc, const char *argv[])  {      pixman_disable_out_of_bounds_workaround (); -    return fuzzer_test_main ("affine", 8000000, 0x1EF2175A, +    return fuzzer_test_main ("affine", 8000000, CHECKSUM,  			     test_composite, argc, argv);  } diff --git a/pixman/test/scaling-test.c b/pixman/test/scaling-test.c index 6f2da1432..44c4f3de4 100644 --- a/pixman/test/scaling-test.c +++ b/pixman/test/scaling-test.c @@ -357,11 +357,21 @@ test_composite (int      testnum,      return crc32;  } +#if BILINEAR_INTERPOLATION_BITS == 8 +#define CHECKSUM 0x80DF1CB2 +#elif BILINEAR_INTERPOLATION_BITS == 7 +#define CHECKSUM 0x2818D5FB +#elif BILINEAR_INTERPOLATION_BITS == 4 +#define CHECKSUM 0x387540A5 +#else +#define CHECKSUM 0x00000000 +#endif +  int  main (int argc, const char *argv[])  {      pixman_disable_out_of_bounds_workaround (); -    return fuzzer_test_main("scaling", 8000000, 0x80DF1CB2, +    return fuzzer_test_main("scaling", 8000000, CHECKSUM,  			    test_composite, argc, argv);  } diff --git a/xorg-server/randr/rrscreen.c b/xorg-server/randr/rrscreen.c index 55110e088..c564d1f96 100644 --- a/xorg-server/randr/rrscreen.c +++ b/xorg-server/randr/rrscreen.c @@ -195,7 +195,7 @@ ProcRRGetScreenSizeRange(ClientPtr client)      rrScrPrivPtr pScrPriv;      int rc; -    REQUEST_SIZE_MATCH(xRRGetScreenInfoReq); +    REQUEST_SIZE_MATCH(xRRGetScreenSizeRangeReq);      rc = dixLookupWindow(&pWin, stuff->window, client, DixGetAttrAccess);      if (rc != Success)          return rc; | 
