5 files changed, 133 insertions, 12 deletions
diff --git a/mesalib/src/gallium/auxiliary/util/u_cpu_detect.c b/mesalib/src/gallium/auxiliary/util/u_cpu_detect.c
index 588fc7c72..87ad78095 100644
--- a/mesalib/src/gallium/auxiliary/util/u_cpu_detect.c
+++ b/mesalib/src/gallium/auxiliary/util/u_cpu_detect.c
@@ -230,8 +230,28 @@ static INLINE uint64_t xgetbv(void)
 #else
    return 0;
 #endif
+}
+
 
+#if defined(PIPE_ARCH_X86)
+static INLINE boolean sse2_has_daz(void)
+{
+   struct {
+      uint32_t pad1[7];
+      uint32_t mxcsr_mask;
+      uint32_t pad2[128-8];
+   } PIPE_ALIGN_VAR(16) fxarea;
+
+   fxarea.mxcsr_mask = 0;
+#if (defined(PIPE_CC_GCC) || defined(PIPE_CC_SUNPRO))
+   __asm __volatile ("fxsave %0" : "+m" (fxarea));
+#elif (defined(PIPE_CC_MSVC) || defined(PIPE_CC_ICL))
+   _fxsave(&fxarea);
+#endif
+   return !!(fxarea.mxcsr_mask & (1 << 6));
 }
+#endif
+
 #endif /* X86 or X86_64 */
 
 void
@@ -310,6 +330,12 @@ util_cpu_detect(void)
                                     ((xgetbv() & 6) == 6);    // XMM & YMM
          util_cpu_caps.has_f16c   = (regs2[2] >> 29) & 1;
          util_cpu_caps.has_mmx2   = util_cpu_caps.has_sse; /* SSE cpus supports mmxext too */
+#if defined(PIPE_ARCH_X86_64)
+         util_cpu_caps.has_daz = 1;
+#else
+         util_cpu_caps.has_daz = util_cpu_caps.has_sse3 ||
+            (util_cpu_caps.has_sse2 && sse2_has_daz());
+#endif
 
          cacheline = ((regs2[1] >> 8) & 0xFF) * 8;
          if (cacheline > 0)
@@ -368,9 +394,12 @@ util_cpu_detect(void)
       debug_printf("util_cpu_caps.has_sse4_1 = %u\n", util_cpu_caps.has_sse4_1);
       debug_printf("util_cpu_caps.has_sse4_2 = %u\n", util_cpu_caps.has_sse4_2);
       debug_printf("util_cpu_caps.has_avx = %u\n", util_cpu_caps.has_avx);
+      debug_printf("util_cpu_caps.has_f16c = %u\n", util_cpu_caps.has_f16c);
+      debug_printf("util_cpu_caps.has_popcnt = %u\n", util_cpu_caps.has_popcnt);
       debug_printf("util_cpu_caps.has_3dnow = %u\n", util_cpu_caps.has_3dnow);
       debug_printf("util_cpu_caps.has_3dnow_ext = %u\n", util_cpu_caps.has_3dnow_ext);
       debug_printf("util_cpu_caps.has_altivec = %u\n", util_cpu_caps.has_altivec);
+      debug_printf("util_cpu_caps.has_daz = %u\n", util_cpu_caps.has_daz);
    }
 #endif
 
diff --git a/mesalib/src/gallium/auxiliary/util/u_cpu_detect.h b/mesalib/src/gallium/auxiliary/util/u_cpu_detect.h
index f9cd6475e..cc3e0ce03 100644
--- a/mesalib/src/gallium/auxiliary/util/u_cpu_detect.h
+++ b/mesalib/src/gallium/auxiliary/util/u_cpu_detect.h
@@ -68,6 +68,7 @@ struct util_cpu_caps {
    unsigned has_3dnow:1;
    unsigned has_3dnow_ext:1;
    unsigned has_altivec:1;
+   unsigned has_daz:1;
 };
 
 extern struct util_cpu_caps
diff --git a/mesalib/src/gallium/auxiliary/util/u_format_srgb.h b/mesalib/src/gallium/auxiliary/util/u_format_srgb.h
index 82ed9575d..740a91974 100644
--- a/mesalib/src/gallium/auxiliary/util/u_format_srgb.h
+++ b/mesalib/src/gallium/auxiliary/util/u_format_srgb.h
@@ -39,6 +39,7 @@
 
 
 #include "pipe/p_compiler.h"
+#include "u_pack_color.h"
 #include "u_math.h"
 
 
@@ -51,23 +52,58 @@ util_format_srgb_to_linear_8unorm_table[256];
 extern const uint8_t
 util_format_linear_to_srgb_8unorm_table[256];
 
+extern const unsigned
+util_format_linear_to_srgb_helper_table[104];
+
 
 /**
  * Convert a unclamped linear float to srgb value in the [0,255].
- * XXX this hasn't been tested (render to srgb surface).
- * XXX this needs optimization.
  */
 static INLINE uint8_t
 util_format_linear_float_to_srgb_8unorm(float x)
 {
-   if (x >= 1.0f)
-      return 255;
-   else if (x >= 0.0031308f)
-      return float_to_ubyte(1.055f * powf(x, 0.41666f) - 0.055f);
-   else if (x > 0.0f)
-      return float_to_ubyte(12.92f * x);
-   else
-      return 0;
+   /* this would be exact but (probably much) slower */
+   if (0) {
+      if (x >= 1.0f)
+         return 255;
+      else if (x >= 0.0031308f)
+         return float_to_ubyte(1.055f * powf(x, 0.41666666f) - 0.055f);
+      else if (x > 0.0f)
+         return float_to_ubyte(12.92f * x);
+      else
+         return 0;
+   }
+   else {
+      /*
+       * This is taken from https://gist.github.com/rygorous/2203834
+       * Use LUT and do linear interpolation.
+       */
+      union fi almostone, minval, f;
+      unsigned tab, bias, scale, t;
+
+      almostone.ui = 0x3f7fffff;
+      minval.ui = (127-13) << 23;
+
+      /*
+       * Clamp to [2^(-13), 1-eps]; these two values map to 0 and 1, respectively.
+       * The tests are carefully written so that NaNs map to 0, same as in the
+       * reference implementation.
+       */
+      if (!(x > minval.f))
+         x = minval.f;
+      if (x > almostone.f)
+         x = almostone.f;
+
+      /* Do the table lookup and unpack bias, scale */
+      f.f = x;
+      tab = util_format_linear_to_srgb_helper_table[(f.ui - minval.ui) >> 20];
+      bias = (tab >> 16) << 9;
+      scale = tab & 0xffff;
+
+      /* Grab next-highest mantissa bits and perform linear interpolation */
+      t = (f.ui >> 12) & 0xff;
+      return (uint8_t) ((bias + scale*t) >> 16);
+   }
 }
 
 
diff --git a/mesalib/src/gallium/auxiliary/util/u_format_srgb.py b/mesalib/src/gallium/auxiliary/util/u_format_srgb.py
index cd63ae789..c6c02f053 100644
--- a/mesalib/src/gallium/auxiliary/util/u_format_srgb.py
+++ b/mesalib/src/gallium/auxiliary/util/u_format_srgb.py
@@ -40,6 +40,7 @@ CopyRight = '''
 
 
 import math
+import struct
 
 
 def srgb_to_linear(x):
@@ -51,10 +52,11 @@ def srgb_to_linear(x):
 
 def linear_to_srgb(x):
     if x >= 0.0031308:
-        return 1.055 * math.pow(x, 0.41666) - 0.055
+        return 1.055 * math.pow(x, 0.41666666) - 0.055
     else:
         return 12.92 * x
 
+
 def generate_srgb_tables():
     print 'const float'
     print 'util_format_srgb_8unorm_to_linear_float_table[256] = {'
@@ -84,6 +86,59 @@ def generate_srgb_tables():
     print '};'
     print
 
+# calculate the table interpolation values used in float linear to unorm8 srgb
+    numexp = 13
+    mantissa_msb = 3
+# stepshift is just used to only use every x-th float to make things faster,
+# 5 is largest value which still gives exact same table as 0
+    stepshift = 5
+    nbuckets = numexp << mantissa_msb
+    bucketsize = (1 << (23 - mantissa_msb)) >> stepshift
+    mantshift = 12
+    valtable = []
+    sum_aa = float(bucketsize)
+    sum_ab = 0.0
+    sum_bb = 0.0
+    for i in range(0, bucketsize):
+        j = (i << stepshift) >> mantshift
+        sum_ab += j
+        sum_bb += j*j
+    inv_det = 1.0 / (sum_aa * sum_bb - sum_ab * sum_ab)
+
+    for bucket in range(0, nbuckets):
+        start = ((127 - numexp) << 23) + bucket*(bucketsize << stepshift)
+        sum_a = 0.0
+        sum_b = 0.0
+ 
+        for i in range(0, bucketsize):
+            j = (i << stepshift) >> mantshift
+            fint = start + (i << stepshift)
+            ffloat = struct.unpack('f', struct.pack('I', fint))[0]
+            val = linear_to_srgb(ffloat) * 255.0 + 0.5
+            sum_a += val
+            sum_b += j*val
+
+        solved_a = inv_det * (sum_bb*sum_a - sum_ab*sum_b)
+        solved_b = inv_det * (sum_aa*sum_b - sum_ab*sum_a)
+
+        scaled_a = solved_a * 65536.0 / 512.0
+        scaled_b = solved_b * 65536.0
+ 
+        int_a = int(scaled_a + 0.5)
+        int_b = int(scaled_b + 0.5)
+
+        valtable.append((int_a << 16) + int_b)
+
+    print 'const unsigned'
+    print 'util_format_linear_to_srgb_helper_table[104] = {'
+
+    for j in range(0, nbuckets, 4):
+        print '   ',
+        for i in range(j, j + 4):
+            print '0x%08x,' % (valtable[i],),
+        print
+    print '};'
+    print
 
 def main():
     print '/* This file is autogenerated by u_format_srgb.py. Do not edit directly. */'
diff --git a/mesalib/src/gallium/auxiliary/util/u_math.c b/mesalib/src/gallium/auxiliary/util/u_math.c
index f3fe392ba..6981ee939 100644
--- a/mesalib/src/gallium/auxiliary/util/u_math.c
+++ b/mesalib/src/gallium/auxiliary/util/u_math.c
@@ -111,7 +111,7 @@ util_fpstate_set_denorms_to_zero(unsigned current_mxcsr)
    if (util_cpu_caps.has_sse) {
       /* Enable flush to zero mode */
       current_mxcsr |= _MM_FLUSH_ZERO_MASK;
-      if (util_cpu_caps.has_sse3) {
+      if (util_cpu_caps.has_daz) {
          /* Enable denormals are zero mode */
          current_mxcsr |= _MM_DENORMALS_ZERO_MASK;
       }