Hello community,

here is the log from the commit of package babl for openSUSE:Factory checked in 
at 2018-05-25 21:35:56
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Comparing /work/SRC/openSUSE:Factory/babl (Old)
 and      /work/SRC/openSUSE:Factory/.babl.new (New)
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

Package is "babl"

Fri May 25 21:35:56 2018 rev:41 rq:610968 version:0.1.50

Changes:
--------
--- /work/SRC/openSUSE:Factory/babl/babl.changes        2018-05-13 15:53:34.871473053 +0200
+++ /work/SRC/openSUSE:Factory/.babl.new/babl.changes   2018-05-25 21:35:57.290921426 +0200
@@ -1,0 +2,14 @@
+Mon May 21 00:47:05 UTC 2018 - [email protected]
+
+- Improvements to speed and precision of indexed code,
+  improvements to meson build.
+
+-------------------------------------------------------------------
+Wed May 16 18:21:48 UTC 2018 - [email protected]
+
+- Update to version 0.1.48:
+  + Fix u8 <-> double conversions for chroma, SSE2 version of RGBA
+    float to CIE L / Lab.
+  + Build with -Ofast by default.
+
+-------------------------------------------------------------------

Old:
----
  babl-0.1.46.tar.bz2

New:
----
  babl-0.1.50.tar.bz2

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

Other differences:
------------------
++++++ babl.spec ++++++
--- /var/tmp/diff_new_pack.Kl9eNQ/_old  2018-05-25 21:35:57.930898263 +0200
+++ /var/tmp/diff_new_pack.Kl9eNQ/_new  2018-05-25 21:35:57.930898263 +0200
@@ -18,7 +18,7 @@
 
 %define debug_package_requires libbabl-0_1-0 = %{version}-%{release}
 Name:           babl
-Version:        0.1.46
+Version:        0.1.50
 Release:        0
 Summary:        Dynamic Pixel Format Translation Library
 License:        LGPL-3.0-or-later

++++++ babl-0.1.46.tar.bz2 -> babl-0.1.50.tar.bz2 ++++++
++++ 4680 lines of diff (skipped)
++++    retrying with extended exclude list
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' --exclude Makefile.in --exclude configure --exclude 
config.guess --exclude '*.pot' --exclude mkinstalldirs --exclude aclocal.m4 
--exclude config.sub --exclude depcomp --exclude install-sh --exclude ltmain.sh 
old/babl-0.1.46/INSTALL new/babl-0.1.50/INSTALL
--- old/babl-0.1.46/INSTALL     2018-04-10 23:11:28.000000000 +0200
+++ new/babl-0.1.50/INSTALL     2018-05-20 10:48:54.000000000 +0200
@@ -1,5 +1,5 @@
 
-babl 0.1.46
+babl 0.1.50
 
     Dynamic; any to any, pixel format conversion library.
 
@@ -12,10 +12,10 @@
 installation (or a variation on this theme):
 
      ------------------------------------------------------------
-     foo$ wget ftp://ftp.gtk.org/pub/babl/0.1/babl-0.1.46.tar.bz2
-     foo$ tar jxf babl-0.1.46.tar.gz
-     foo$ cd babl-0.1.46
-     foo/babl-0.1.46$ ./configure && make && sudo make install
+     foo$ wget ftp://ftp.gtk.org/pub/babl/0.1/babl-0.1.50.tar.bz2
+     foo$ tar jxf babl-0.1.50.tar.gz
+     foo$ cd babl-0.1.50
+     foo/babl-0.1.50$ ./configure && make && sudo make install
      ------------------------------------------------------------
 
 
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' --exclude Makefile.in --exclude configure --exclude 
config.guess --exclude '*.pot' --exclude mkinstalldirs --exclude aclocal.m4 
--exclude config.sub --exclude depcomp --exclude install-sh --exclude ltmain.sh 
old/babl-0.1.46/NEWS new/babl-0.1.50/NEWS
--- old/babl-0.1.46/NEWS        2018-04-10 23:08:46.000000000 +0200
+++ new/babl-0.1.50/NEWS        2018-05-20 10:45:34.000000000 +0200
@@ -3,7 +3,14 @@
 the news section both in the README and the webpage.
                                                                           -->
 
-
+2018-05-20 babl-0.1.50                                              </dt><dd>
+Improvements to speed and precision of indexed code, improvements to meson
+build.
+                                                                    </dd><dt>
+2018-05-15 babl-0.1.48                                              </dt><dd>
+fix u8 &lt;-&gt; double conversions for chroma, SSE2 version of RGBA float to
+CIE L / Lab. Build with -Ofast by default.
+                                                                    </dd><dt>
 2018-04-10 babl-0.1.46                                              </dt><dd>
 added extensions with more coverage for u32, half and other utilit fast paths
 improving fast path coverage.
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' --exclude Makefile.in --exclude configure --exclude 
config.guess --exclude '*.pot' --exclude mkinstalldirs --exclude aclocal.m4 
--exclude config.sub --exclude depcomp --exclude install-sh --exclude ltmain.sh 
old/babl-0.1.46/README new/babl-0.1.50/README
--- old/babl-0.1.46/README      2018-04-10 23:12:04.000000000 +0200
+++ new/babl-0.1.50/README      2018-05-20 10:49:19.000000000 +0200
@@ -1,4 +1,4 @@
-Babl-0.1.46
+Babl-0.1.50
 
 Contents
 
@@ -63,6 +63,12 @@
 release is done a babl release is most often put out just prior to the
 GEGL release.
 
+2018-05-20 babl-0.1.50
+    Improvements to speed and precision of indexed code, improvements
+    to meson build.
+2018-05-15 babl-0.1.48
+    fix u8 <-> double conversions for chroma, SSE2 version of RGBA
+    float to CIE L / Lab. Build with -Ofast by default.
 2018-04-10 babl-0.1.46
     added extensions with more coverage for u32, half and other utilit
     fast paths improving fast path coverage.
@@ -1643,19 +1649,6 @@
 
     u8  Y
 
-cairo-ARGB32
-
-bytes/pixel
-    4
-model
-    R'aG'aB'aA
-components
-
-    u8 B'a
-    u8 G'a
-    u8 R'a
-    u8 A
-
 cairo-RGB24
 
 bytes/pixel
@@ -1977,6 +1970,19 @@
     float saturation
     float lightness
 
+cairo-ARGB32
+
+bytes/pixel
+    4
+model
+    R'aG'aB'aA
+components
+
+    u8 B'a
+    u8 G'a
+    u8 R'a
+    u8 A
+
 cairo-A8
 
 bytes/pixel
@@ -2281,4 +2287,4 @@
 Félix Piédallu
     Initial meson build
 
-/babl-0.1.46
+/babl-0.1.50
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' --exclude Makefile.in --exclude configure --exclude 
config.guess --exclude '*.pot' --exclude mkinstalldirs --exclude aclocal.m4 
--exclude config.sub --exclude depcomp --exclude install-sh --exclude ltmain.sh 
old/babl-0.1.46/babl/babl-icc.c new/babl-0.1.50/babl/babl-icc.c
--- old/babl-0.1.46/babl/babl-icc.c     2017-10-25 16:48:41.000000000 +0200
+++ new/babl-0.1.50/babl/babl-icc.c     2018-04-25 20:22:04.000000000 +0200
@@ -342,7 +342,7 @@
                 fprintf (stderr, "%f %f %f %f %f %f %f\n",
                               g, a, b, c, d, e, f);
             {
-              fprintf (stdout, "unhandled parametric sRGB formula TRC type %i\n", function_type);
+              fprintf (stderr, "unhandled parametric sRGB formula TRC type %i\n", function_type);
               *error = "unhandled sRGB formula like TRC";
               return babl_trc_gamma (2.2);
             }
@@ -350,7 +350,7 @@
               break;
             default:
               *error = "unhandled parametric TRC";
-              fprintf (stdout, "unhandled parametric TRC type %i\n", function_type);
+              fprintf (stderr, "unhandled parametric TRC type %i\n", function_type);
               return babl_trc_gamma (2.2);
             break;
          }
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' --exclude Makefile.in --exclude configure --exclude 
config.guess --exclude '*.pot' --exclude mkinstalldirs --exclude aclocal.m4 
--exclude config.sub --exclude depcomp --exclude install-sh --exclude ltmain.sh 
old/babl-0.1.46/babl/babl-image.c new/babl-0.1.50/babl/babl-image.c
--- old/babl-0.1.46/babl/babl-image.c   2017-09-21 21:06:11.000000000 +0200
+++ new/babl-0.1.50/babl/babl-image.c   2018-05-19 17:21:32.000000000 +0200
@@ -64,7 +64,7 @@
 
   babl->class_type       = BABL_IMAGE;
   babl->instance.id      = 0;
-  babl->instance.name    = "slaritbartfast";
+  babl->instance.name    = "slartibartfast";
   babl->image.format     = format;
   babl->image.model      = model;
   babl->image.components = components;
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' --exclude Makefile.in --exclude configure --exclude 
config.guess --exclude '*.pot' --exclude mkinstalldirs --exclude aclocal.m4 
--exclude config.sub --exclude depcomp --exclude install-sh --exclude ltmain.sh 
old/babl-0.1.46/babl/babl-internal.h new/babl-0.1.50/babl/babl-internal.h
--- old/babl-0.1.46/babl/babl-internal.h        2018-01-20 18:39:30.000000000 +0100
+++ new/babl-0.1.50/babl/babl-internal.h        2018-04-25 20:20:56.000000000 +0200
@@ -127,7 +127,7 @@
           __android_log_print (ANDROID_LOG_DEBUG, "BABL",
                                "When loading %s:\n\t", babl_extender()->instance.name);
 #else
-          fprintf (stdout, "When loading %s:\n\t", babl_extender()->instance.name);
+          fprintf (stderr, "When loading %s:\n\t", babl_extender()->instance.name);
 #endif
         }
 
@@ -135,7 +135,7 @@
       __android_log_print (ANDROID_LOG_DEBUG, "BABL",
                            "%s:%i %s()", file, line, function);
 #else
-      fprintf (stdout, "%s:%i %s()\n\t", file, line, function);
+      fprintf (stderr, "%s:%i %s()\n\t", file, line, function);
 #endif
     }
 
@@ -143,8 +143,8 @@
   __android_log_vprint (ANDROID_LOG_DEBUG, "BABL",
                         fmt, varg);
 #else
-  vfprintf (stdout, fmt, varg);
-  fprintf (stdout, "\n");
+  vfprintf (stderr, fmt, varg);
+  fprintf (stderr, "\n");
   fflush (NULL);
 #endif
   return;
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' --exclude Makefile.in --exclude configure --exclude 
config.guess --exclude '*.pot' --exclude mkinstalldirs --exclude aclocal.m4 
--exclude config.sub --exclude depcomp --exclude install-sh --exclude ltmain.sh 
old/babl-0.1.46/babl/babl-palette.c new/babl-0.1.50/babl/babl-palette.c
--- old/babl-0.1.46/babl/babl-palette.c 2018-01-03 17:01:19.000000000 +0100
+++ new/babl-0.1.50/babl/babl-palette.c 2018-05-18 15:32:14.000000000 +0200
@@ -19,6 +19,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include <stdio.h>
+#include <math.h>
 #include <limits.h>
 #include <assert.h>
 #include "config.h"
@@ -31,8 +32,8 @@
 /* A default palette, containing standard ANSI / EGA colors
  *
  */
-static unsigned char defpal_data[4*16] = 
-{  
+static unsigned char defpal_data[4*16] =
+{
 0  ,0  ,0  ,255,
 127,0  ,0  ,255,
 0  ,127,0  ,255,
@@ -52,6 +53,14 @@
 };
 static double defpal_double[4*8*16];
 
+static unsigned short ceil_sqrt_u8[3 * 255 * 255 + 1];
+
+
+typedef struct BablPaletteRadius
+{
+  unsigned char  idx;
+  unsigned short diff;
+} BablPaletteRadius;
 
 typedef struct BablPalette
 {
@@ -62,10 +71,88 @@
                                   */
   double                *data_double;
   unsigned char         *data_u8;
+  BablPaletteRadius     *radii;
   volatile unsigned int  hash[HASH_TABLE_SIZE];
 } BablPalette;
 
 static void
+init_ceil_sqrt_u8 (void)
+{
+  int i;
+
+  babl_mutex_lock (babl_format_mutex);
+
+  if (! ceil_sqrt_u8[1])
+    {
+      for (i = 0; i <= 3 * 255 * 255; i++)
+        ceil_sqrt_u8[i] = ceil (sqrt (i));
+    }
+
+  babl_mutex_unlock (babl_format_mutex);
+}
+
+static inline int
+diff2_u8 (const unsigned char *p1,
+          const unsigned char *p2)
+{
+  return ((int) p1[0] - (int) p2[0]) * ((int) p1[0] - (int) p2[0]) +
+         ((int) p1[1] - (int) p2[1]) * ((int) p1[1] - (int) p2[1]) +
+         ((int) p1[2] - (int) p2[2]) * ((int) p1[2] - (int) p2[2]);
+}
+
+static int
+babl_palette_radius_compare (const void *r1,
+                             const void *r2)
+{
+  const BablPaletteRadius *radius1 = r1;
+  const BablPaletteRadius *radius2 = r2;
+
+  return (int) radius1->diff - (int) radius2->diff;
+}
+
+static void
+babl_palette_init_radii (BablPalette *pal)
+{
+  int i, j;
+
+  init_ceil_sqrt_u8 ();
+
+  /* calculate the distance between each pair of colors in the palette, and, for
+   * each color, construct a list of all other colors and their distances from
+   * it, sorted by distance.  we use these lists in babl_palette_lookup() to
+   * speed up the search, as described in the function.
+   */
+
+  pal->radii = babl_malloc (sizeof (BablPaletteRadius) *
+                            (pal->count - 1)           *
+                            pal->count);
+
+  for (i = 0; i < pal->count; i++)
+    {
+      BablPaletteRadius   *radii1 = pal->radii + (pal->count - 1) * i;
+      const unsigned char *p1     = pal->data_u8 + 4 * i;
+
+      for (j = i + 1; j < pal->count; j++)
+        {
+          BablPaletteRadius   *radii2 = pal->radii + (pal->count - 1) * j;
+          const unsigned char *p2     = pal->data_u8 + 4 * j;
+          unsigned short       diff;
+
+          diff = floor (sqrt (diff2_u8 (p1, p2)));
+
+          radii1[j - 1].idx  = j;
+          radii1[j - 1].diff = diff;
+
+          radii2[i].idx      = i;
+          radii2[i].diff     = diff;
+        }
+
+      qsort (radii1, pal->count - 1, sizeof (BablPaletteRadius),
+             babl_palette_radius_compare);
+    }
+}
+
+static void
 babl_palette_reset_hash (BablPalette *pal)
 {
   int i;
@@ -75,10 +162,14 @@
     }
 }
 
+#define BABL_IDX_FACTOR 255.5
+
 static int
-babl_palette_lookup (BablPalette *pal, int r, int g, int b, int a)
+babl_palette_lookup (BablPalette         *pal,
+                     const unsigned char *p,
+                     int                  best_idx)
 {
-  unsigned int pixel      = (r << 16) | (g << 8) | b;
+  unsigned int pixel      = p[0] | (p[1] << 8) | (p[2] << 16);
   int          hash_index = pixel % HASH_TABLE_SIZE;
   unsigned int hash_value = pal->hash[hash_index];
   unsigned int hash_pixel = hash_value & 0x00ffffffu;
@@ -97,26 +188,60 @@
     }
   else
     {
-      int best_idx = 0;
-      int best_diff = INT_MAX;
+      const BablPaletteRadius *radii = pal->radii + (pal->count - 1) * best_idx;
+      const unsigned char     *q;
+      int                      best_diff2;
+      int                      best_diff;
+      int                      diff0;
+      int                      i;
+
+      /* best_idx is the closest palette entry to the previous pixel (referred
+       * to as the source color).  based on the assumption that nearby pixels
+       * have similar color, we start the search for the current closest entry
+       * at best_idx, and iterate over the entry's color list, as calculated in
+       * babl_palette_init_radii(), in search for a better match.
+       */
+
+      q          = pal->data_u8 + 4 * best_idx;
+      best_diff2 = diff2_u8 (p, q);
+      best_diff  = ceil_sqrt_u8[best_diff2];
+      diff0      = best_diff;
 
-      for (idx = 0; idx < pal->count; idx++)
+      for (i = 0; i < pal->count - 1; i++)
         {
-          unsigned char *palpx = pal->data_u8 + idx * 4;
-          int pr = palpx[0];
-          int pg = palpx[1];
-          int pb = palpx[2];
-
-          int diff = (r - pr) * (r - pr) +
-                     (g - pg) * (g - pg) +
-                     (b - pb) * (b - pb);
-          if (diff < best_diff)
+          const BablPaletteRadius *radius = &radii[i];
+          int                      min_diff;
+          int                      diff2;
+
+          /* radius->diff is the distance from the source color to the current
+           * color.  diff0 is the distance from the source color to the input
+           * color.  according to the triangle inequality, the distance from
+           * the current color to the input color is at least
+           * radius->diff - diff0.  if the shortest distance found so far is
+           * less than that, then the best match found so far is necessarily
+           * better than the current color, and we can stop the search, since
+           * the color list is sorted in ascending radius->diff order.
+           */
+
+          idx      = radius->idx;
+          min_diff = radius->diff - diff0;
+
+          if (best_diff < min_diff || (best_diff == min_diff && best_idx < idx))
+            break;
+
+          q     = pal->data_u8 + 4 * idx;
+          diff2 = diff2_u8 (p, q);
+
+          if (diff2 < best_diff2 || (diff2 == best_diff2 && idx < best_idx))
             {
-              best_diff = diff;
-              best_idx  = idx;
+              best_idx   = idx;
+              best_diff2 = diff2;
+              best_diff  = ceil_sqrt_u8[diff2];
             }
         }
+
       pal->hash[hash_index] = ((unsigned int) best_idx << 24) | pixel;
+
       return best_idx;
     }
 }
@@ -139,6 +264,8 @@
   babl_process (babl_fish (format, babl_format ("R'G'B'A u8")),
                 data, pal->data_u8, count);
 
+  babl_palette_init_radii (pal);
+
   babl_palette_reset_hash (pal);
 
   return pal;
@@ -149,6 +276,7 @@
   babl_free (pal->data);
   babl_free (pal->data_double);
   babl_free (pal->data_u8);
+  babl_free (pal->radii);
   babl_free (pal);
 }
 
@@ -172,93 +300,101 @@
   babl_process (babl_fish (pal.format, babl_format ("RGBA double")),
                 pal.data, pal.data_double, pal.count);
 
+  babl_palette_init_radii (&pal);
   babl_palette_reset_hash (&pal);
   return &pal;
 }
 
 static void
 rgba_to_pal (Babl *conversion,
-             char *src,
+             char *src_b,
              char *dst,
              long  n,
              void *dst_model_data)
 {
+  const Babl *space = babl_conversion_get_source_space (conversion);
   BablPalette **palptr = dst_model_data;
-  BablPalette *pal = *palptr;
+  BablPalette *pal;
+  int best_idx = 0;
+  assert (palptr);
+  pal = *palptr;
+  assert(pal);
+
   while (n--)
     {
-      int idx;
-
-      int best_idx = 0;
-      double best_diff = 100000;
-      double *srcf;
-
-      srcf = ((double *) src);
-
-      for (idx = 0; idx<pal->count; idx++)
-        {
-          double diff;
-          double *palpx = ((double *)pal->data_double) + idx * 4;
+      double *src_d = (void*) src_b;
+      unsigned char src[4];
+      int c;
+      for (c = 0; c < 3; c++)
+      {
+        if (src_d[c] >= 1.0f)
+          src[c] = 255;
+        else if (src_d[c] <= 0.0f)
+          src[c] = 0;
+        else
+          src[c] = babl_trc_from_linear (space->space.trc[0],
+                                         src_d[c]) * 255 + 0.5f;
+      }
+      if (src_d[3] >= 1.0f)
+        src[3] = 255;
+      else if (src_d[3] <= 0.0f)
+        src[3] = 0;
+      else
+        src[3] = src_d[3] * 255 + 0.5f;
 
-          diff = (palpx[0] - srcf[0]) * (palpx[0] - srcf[0]) +
-                 (palpx[1] - srcf[1]) * (palpx[1] - srcf[1]) +
-                 (palpx[2] - srcf[2]) * (palpx[2] - srcf[2]);
-          if (diff <= best_diff)
-            {
-              best_diff = diff;
-              best_idx = idx;
-            }
-        }
+      best_idx = babl_palette_lookup (pal, src, best_idx);
 
-      ((double *) dst)[0] = best_idx / 255.5;
+      ((double *) dst)[0] = best_idx / BABL_IDX_FACTOR;
 
-      src += sizeof (double) * 4;
+      src_b += sizeof (double) * 4;
       dst += sizeof (double) * 1;
     }
+
 }
 
 static void
 rgba_to_pala (Babl *conversion,
-              char *src,
+              char *src_i,
               char *dst,
               long  n,
               void *dst_model_data)
 {
+  const Babl *space = babl_conversion_get_destination_space (conversion);
   BablPalette **palptr = dst_model_data;
-  BablPalette *pal = *palptr;
-  
+  BablPalette *pal;
+  int best_idx = 0;
+  assert (palptr);
+  pal = *palptr;
   assert(pal);
+
   while (n--)
     {
-      int idx;
-
-      int best_idx = 0;
-      double best_diff = 100000;
-      double *srcf;
-      double alpha;
+      double *src_d = (void*) src_i;
+      unsigned char src[4];
+      int c;
+      for (c = 0; c < 3; c++)
+      {
+        if (src_d[c] >= 1.0f)
+          src[c] = 255;
+        else if (src_d[c] <= 0.0f)
+          src[c] = 0;
+        else
+          src[c] = babl_trc_from_linear (space->space.trc[0],
+                                         src_d[c]) * 255 + 0.5f;
+      }
+      if (src_d[3] >= 1.0f)
+        src[3] = 255;
+      else if (src_d[3] <= 0.0f)
+        src[3] = 0;
+      else
+        src[3] = src_d[3] * 255 + 0.5f;
 
-      srcf = ((double *) src);
-      alpha = srcf[3];
-
-      for (idx = 0; idx<pal->count; idx++)
-        {
-          double diff;
-          double *palpx = ((double *)pal->data_double) + idx * 4;
-
-          diff = (palpx[0] - srcf[0]) * (palpx[0] - srcf[0]) +
-                 (palpx[1] - srcf[1]) * (palpx[1] - srcf[1]) +
-                 (palpx[2] - srcf[2]) * (palpx[2] - srcf[2]);
-          if (diff <= best_diff)
-            {
-              best_diff = diff;
-              best_idx = idx;
-            }
-        }
+      best_idx = babl_palette_lookup (pal, src, best_idx);
 
-      ((double *) dst)[0] = best_idx / 255.5;
-      ((double *) dst)[1] = alpha;
+      ((double *) dst)[0] = best_idx / BABL_IDX_FACTOR;
+      ((double *) dst)[1] = src_d[3];
 
-      src += sizeof (double) * 4;
+      src_i += sizeof (double) * 4;
       dst += sizeof (double) * 2;
     }
 }
@@ -275,7 +411,7 @@
   assert(pal);
   while (n--)
     {
-      int idx = (((double *) src)[0]) * 255.5;
+      int idx = (((double *) src)[0]) * BABL_IDX_FACTOR;
       double *palpx;
 
       if (idx < 0) idx = 0;
@@ -302,7 +438,7 @@
   assert(pal);
   while (n--)
     {
-      int idx      = (((double *) src)[0]) * 255.5;
+      int idx      = (((double *) src)[0]) * BABL_IDX_FACTOR;
       double alpha = (((double *) src)[1]);
       double *palpx;
 
@@ -312,7 +448,7 @@
       palpx = ((double *)pal->data_double) + idx * 4;
       memcpy (dst, palpx, sizeof(double)*4);
 
-      ((double *)dst)[3] *= alpha; 
+      ((double *)dst)[3] *= alpha;
 
       src += sizeof (double) * 2;
       dst += sizeof (double) * 4;
@@ -320,6 +456,97 @@
 }
 
 static void
+rgba_float_to_pal_a (Babl          *conversion,
+                     unsigned char *src_b,
+                     unsigned char *dst,
+                     long           n,
+                     void          *src_model_data)
+{
+  const Babl *space = babl_conversion_get_destination_space (conversion);
+  BablPalette **palptr = src_model_data;
+  BablPalette *pal;
+  int best_idx = 0;
+  assert (palptr);
+  pal = *palptr;
+  assert(pal);
+
+  while (n--)
+    {
+      float *src_f = (void*) src_b;
+      unsigned char src[4];
+      int c;
+      for (c = 0; c < 3; c++)
+      {
+        if (src_f[c] >= 1.0f)
+          src[c] = 255;
+        else if (src_f[c] <= 0.0f)
+          src[c] = 0;
+        else
+          src[c] = babl_trc_from_linear (space->space.trc[0],
+                                         src_f[c]) * 255 + 0.5f;
+      }
+      if (src_f[3] >= 1.0f)
+        src[3] = 255;
+      else if (src_f[3] <= 0.0f)
+        src[3] = 0;
+      else
+        src[3] = src_f[3] * 255 + 0.5f;
+
+
+      dst[0] = best_idx = babl_palette_lookup (pal, src, best_idx);
+      dst[1] = src[3];
+
+      src_b += sizeof (float) * 4;
+      dst += sizeof (char) * 2;
+    }
+}
+
+
+static void
+rgba_float_to_pal (Babl          *conversion,
+                   unsigned char *src_b,
+                   unsigned char *dst,
+                   long           n,
+                   void          *src_model_data)
+{
+  const Babl *space = babl_conversion_get_destination_space (conversion);
+  BablPalette **palptr = src_model_data;
+  BablPalette *pal;
+  int best_idx = 0;
+  assert (palptr);
+  pal = *palptr;
+  assert(pal);
+
+  while (n--)
+    {
+      float *src_f = (void*) src_b;
+      unsigned char src[4];
+      int c;
+      for (c = 0; c < 3; c++)
+      {
+        if (src_f[c] >= 1.0f)
+          src[c] = 255;
+        else if (src_f[c] <= 0.0f)
+          src[c] = 0;
+        else
+          src[c] = babl_trc_from_linear (space->space.trc[0],
+                                         src_f[c]) * 255 + 0.5f;
+      }
+      if (src_f[3] >= 1.0f)
+        src[3] = 255;
+      else if (src_f[3] <= 0.0f)
+        src[3] = 0;
+      else
+        src[3] = src_f[3] * 255 + 0.5f;
+
+      dst[0] = best_idx = babl_palette_lookup (pal, src, best_idx);
+
+      src_b += sizeof (float) * 4;
+      dst += sizeof (char) * 1;
+    }
+}
+
+static void
 rgba_u8_to_pal (Babl          *conversion,
                 unsigned char *src,
                 unsigned char *dst,
@@ -328,12 +555,14 @@
 {
   BablPalette **palptr = src_model_data;
   BablPalette *pal;
+  int best_idx = 0;
   assert (palptr);
   pal = *palptr;
   assert(pal);
+
   while (n--)
     {
-      dst[0] = babl_palette_lookup (pal, src[0], src[1], src[2], src[3]);
+      dst[0] = best_idx = babl_palette_lookup (pal, src, best_idx);
 
       src += sizeof (char) * 4;
       dst += sizeof (char) * 1;
@@ -349,12 +578,13 @@
 {
   BablPalette **palptr = src_model_data;
   BablPalette *pal;
+  int best_idx = 0;
   assert (palptr);
   pal = *palptr;
   assert(pal);
   while (n--)
     {
-      dst[0] = babl_palette_lookup (pal, src[0], src[1], src[2], src[3]);
+      dst[0] = best_idx = babl_palette_lookup (pal, src, best_idx);
       dst[1] = src[3];
 
       src += sizeof (char) * 4;
@@ -510,7 +740,6 @@
     "chroma",
     NULL);
   alpha = babl_component ("A");
-  
   model = babl_model_new ("name", name, component, alpha, NULL);
   palptr = malloc (sizeof (void*));
   *palptr = default_palette ();;
@@ -558,30 +787,24 @@
      "data", palptr,
      NULL
   );
-
   babl_conversion_new (
      f_pal_u8,
      f_pal_a_u8,
      "linear", conv_pal8_pala8,
      NULL
   );
-
   babl_conversion_new (
      f_pal_a_u8,
      f_pal_u8,
      "linear", conv_pala8_pal8,
      NULL
   );
-
-
   babl_conversion_new (
      f_pal_u8,
      babl_format ("R'G'B'A u8"),
      "linear", pal_u8_to_rgba_u8,
      "data", palptr,
      NULL);
-
-
   babl_conversion_new (
      f_pal_a_u8,
      babl_format ("R'G'B'A u8"),
@@ -602,6 +825,19 @@
      "data", palptr,
      NULL);
 
+  babl_conversion_new (
+     babl_format ("RGBA float"),
+     f_pal_a_u8,
+     "linear", rgba_float_to_pal_a,
+     "data", palptr,
+     NULL);
+  babl_conversion_new (
+     babl_format ("RGBA float"),
+     f_pal_u8,
+     "linear", rgba_float_to_pal,
+     "data", palptr,
+     NULL);
+
   babl_set_user_data (model, palptr);
   babl_set_user_data (model_no_alpha, palptr);
 
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' --exclude Makefile.in --exclude configure --exclude 
config.guess --exclude '*.pot' --exclude mkinstalldirs --exclude aclocal.m4 
--exclude config.sub --exclude depcomp --exclude install-sh --exclude ltmain.sh 
old/babl-0.1.46/babl/babl-version.h new/babl-0.1.50/babl/babl-version.h
--- old/babl-0.1.46/babl/babl-version.h 2018-04-10 23:11:27.000000000 +0200
+++ new/babl-0.1.50/babl/babl-version.h 2018-05-20 10:48:54.000000000 +0200
@@ -34,7 +34,7 @@
 
 #define BABL_MAJOR_VERSION 0
 #define BABL_MINOR_VERSION 1
-#define BABL_MICRO_VERSION 46
+#define BABL_MICRO_VERSION 50
 
 /** Get the version information on the babl library */
 void   babl_get_version (int *major,
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' --exclude Makefile.in --exclude configure --exclude 
config.guess --exclude '*.pot' --exclude mkinstalldirs --exclude aclocal.m4 
--exclude config.sub --exclude depcomp --exclude install-sh --exclude ltmain.sh 
old/babl-0.1.46/babl/base/type-u8.c new/babl-0.1.50/babl/base/type-u8.c
--- old/babl-0.1.46/babl/base/type-u8.c 2017-09-29 00:30:45.000000000 +0200
+++ new/babl-0.1.50/babl/base/type-u8.c 2018-04-21 15:18:07.000000000 +0200
@@ -110,7 +110,7 @@
 
 MAKE_CONVERSIONS (u8, 0.0, 1.0, 0x00, UINT8_MAX)
 MAKE_CONVERSIONS (u8_luma, 0.0, 1.0, 16, 235)
-MAKE_CONVERSIONS (u8_chroma, 0.0, 1.0, 16, 240)
+MAKE_CONVERSIONS (u8_chroma, -0.5, 0.5, 16, 240)
 
 void
 babl_base_type_u8 (void)
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' --exclude Makefile.in --exclude configure --exclude 
config.guess --exclude '*.pot' --exclude mkinstalldirs --exclude aclocal.m4 
--exclude config.sub --exclude depcomp --exclude install-sh --exclude ltmain.sh 
old/babl-0.1.46/config.h.in new/babl-0.1.50/config.h.in
--- old/babl-0.1.46/config.h.in 2018-04-10 23:11:28.000000000 +0200
+++ new/babl-0.1.50/config.h.in 2018-05-20 10:48:54.000000000 +0200
@@ -123,5 +123,8 @@
 /* Define to 1 if SSE2 assembly is available. */
 #undef USE_SSE2
 
+/* Define to 1 if SSE3 assembly is available. */
+#undef USE_SSE3
+
 /* Define to 1 if SSE4_1 assembly is available. */
 #undef USE_SSE4_1
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' --exclude Makefile.in --exclude configure --exclude 
config.guess --exclude '*.pot' --exclude mkinstalldirs --exclude aclocal.m4 
--exclude config.sub --exclude depcomp --exclude install-sh --exclude ltmain.sh 
old/babl-0.1.46/configure.ac new/babl-0.1.50/configure.ac
--- old/babl-0.1.46/configure.ac        2018-04-10 23:04:31.000000000 +0200
+++ new/babl-0.1.50/configure.ac        2018-05-20 10:48:45.000000000 +0200
@@ -14,7 +14,7 @@
 
 m4_define([babl_major_version], [0])
 m4_define([babl_minor_version], [1])
-m4_define([babl_micro_version], [46])
+m4_define([babl_micro_version], [50])
 m4_define([babl_real_version],
           [babl_major_version.babl_minor_version.babl_micro_version])
 m4_define([babl_version], [babl_real_version])
@@ -161,6 +161,9 @@
 BABL_DETECT_CFLAGS(extra_warnings, '-Wold-style-definition')
 CFLAGS="$CFLAGS $extra_warnings"
 
+BABL_DETECT_CFLAGS(extra_warnings, '-Ofast' )
+CFLAGS="$CFLAGS $extra_warnings"
+
 fi
 
 AC_PATH_PROG(RSVG, rsvg-convert, no)
@@ -324,6 +327,10 @@
   [  --enable-sse2            enable SSE2 support (default=auto)],,
   enable_sse2=$enable_sse)
 
+AC_ARG_ENABLE(sse3,
+  [  --enable-sse3            enable SSE3 support (default=auto)],,
+  enable_sse3=$enable_sse2)
+
 AC_ARG_ENABLE(sse4_1,
   [  --enable-sse4_1            enable SSE4_1 support (default=auto)],,
   enable_sse4_1=$enable_sse)
@@ -388,22 +395,40 @@
           AC_MSG_WARN([The assembler does not support the SSE2 command set.])
         )
 
-        if test "x$enable_sse4_1" = xyes; then
-          BABL_DETECT_CFLAGS(sse4_1_flag, '-msse4.1')
-          SSE4_1_EXTRA_CFLAGS="$SSE_EXTRA_CFLAGS $sse4_1_flag"
+        if test "x$enable_sse3" = xyes; then
+          BABL_DETECT_CFLAGS(sse3_flag, '-msse3')
+          SSE3_EXTRA_CFLAGS="$SSE2_EXTRA_CFLAGS $sse3_flag"
 
-          AC_MSG_CHECKING(whether we can compile SSE4_1 code)
+          AC_MSG_CHECKING(whether we can compile SSE3 code)
 
-          CFLAGS="$CFLAGS $sse4_1_flag"
+          CFLAGS="$CFLAGS $sse3_flag"
 
-          AC_COMPILE_IFELSE([AC_LANG_PROGRAM(,[asm ("pmovzxbd %xmm0,%xmm1");])],
-            AC_DEFINE(USE_SSE4_1, 1, [Define to 1 if SSE4_1 assembly is available.])
+          AC_COMPILE_IFELSE([AC_LANG_PROGRAM(,[asm ("addsubpd %xmm0,%xmm1");])],
+            AC_DEFINE(USE_SSE3, 1, [Define to 1 if SSE3 assembly is available.])
             AC_MSG_RESULT(yes)
           ,
-            enable_sse4_1=no
+            enable_sse3=no
             AC_MSG_RESULT(no)
-            AC_MSG_WARN([The assembler does not support the SSE4_1 command set.])
+            AC_MSG_WARN([The assembler does not support the SSE3 command set.])
           )
+
+          if test "x$enable_sse4_1" = xyes; then
+            BABL_DETECT_CFLAGS(sse4_1_flag, '-msse4.1')
+            SSE4_1_EXTRA_CFLAGS="$SSE_EXTRA_CFLAGS $sse4_1_flag"
+
+            AC_MSG_CHECKING(whether we can compile SSE4_1 code)
+
+            CFLAGS="$CFLAGS $sse4_1_flag"
+
+            AC_COMPILE_IFELSE([AC_LANG_PROGRAM(,[asm ("pmovzxbd %xmm0,%xmm1");])],
+              AC_DEFINE(USE_SSE4_1, 1, [Define to 1 if SSE4_1 assembly is available.])
+              AC_MSG_RESULT(yes)
+            ,
+              enable_sse4_1=no
+              AC_MSG_RESULT(no)
+              AC_MSG_WARN([The assembler does not support the SSE4_1 command set.])
+            )
+          fi
         fi
       fi
 
@@ -439,6 +464,7 @@
   AC_SUBST(MMX_EXTRA_CFLAGS)
   AC_SUBST(SSE_EXTRA_CFLAGS)
   AC_SUBST(SSE2_EXTRA_CFLAGS)
+  AC_SUBST(SSE3_EXTRA_CFLAGS)
   AC_SUBST(SSE4_1_EXTRA_CFLAGS)
   AC_SUBST(F16C_EXTRA_CFLAGS)
 fi
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' --exclude Makefile.in --exclude configure --exclude config.guess --exclude '*.pot' --exclude mkinstalldirs --exclude aclocal.m4 --exclude config.sub --exclude depcomp --exclude install-sh --exclude ltmain.sh old/babl-0.1.46/extensions/CIE.c new/babl-0.1.50/extensions/CIE.c
--- old/babl-0.1.46/extensions/CIE.c    2018-04-07 18:14:44.000000000 +0200
+++ new/babl-0.1.50/extensions/CIE.c    2018-05-15 19:07:48.000000000 +0200
@@ -2,7 +2,7 @@
  * Copyright (C) 2005, 2014 Øyvind Kolås.
  * Copyright (C) 2009, Martin Nordholts
  * Copyright (C) 2014, Elle Stone
- * Copyright (C) 2017, Red Hat, Inc.
+ * Copyright (C) 2017, 2018 Red Hat, Inc.
  *
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
@@ -21,8 +21,13 @@
 
 #include "config.h"
 #include <math.h>
+#include <stdint.h>
 #include <string.h>
 
+#if defined(USE_SSE2)
+#include <emmintrin.h>
+#endif /* defined(USE_SSE2) */
+
 #include "babl-internal.h"
 #include "extensions/util.h"
 
@@ -172,24 +177,17 @@
             double *to_a,
             double *to_b)
 {
-  double f_x, f_y, f_z;
-
-  double x_r = X / D50_WHITE_REF_X;
-  double y_r = Y / D50_WHITE_REF_Y;
-  double z_r = Z / D50_WHITE_REF_Z;
-
-  if (x_r > LAB_EPSILON) f_x = cbrt(x_r);
-  else ( f_x = ((LAB_KAPPA * x_r) + 16) / 116.0 );
-
-  if (y_r > LAB_EPSILON) f_y = cbrt(y_r);
-  else ( f_y = ((LAB_KAPPA * y_r) + 16) / 116.0 );
-
-  if (z_r > LAB_EPSILON) f_z = cbrt(z_r);
-  else ( f_z = ((LAB_KAPPA * z_r) + 16) / 116.0 );
-
-  *to_L = (116.0 * f_y) - 16.0;
-  *to_a = 500.0 * (f_x - f_y);
-  *to_b = 200.0 * (f_y - f_z);
+  double xr = X / D50_WHITE_REF_X;
+  double yr = Y / D50_WHITE_REF_Y;
+  double zr = Z / D50_WHITE_REF_Z;
+
+  double fx = xr > LAB_EPSILON ? cbrt (xr) : (LAB_KAPPA * xr + 16.0) / 116.0;
+  double fy = yr > LAB_EPSILON ? cbrt (yr) : (LAB_KAPPA * yr + 16.0) / 116.0;
+  double fz = zr > LAB_EPSILON ? cbrt (zr) : (LAB_KAPPA * zr + 16.0) / 116.0;
+
+  *to_L = 116.0 * fy - 16.0;
+  *to_a = 500.0 * (fx - fy);
+  *to_b = 200.0 * (fy - fz);
 }
 
 static inline void
@@ -200,26 +198,18 @@
             double *to_Y,
             double *to_Z)
 {
-  double fy, fx, fz, fx_cubed, fy_cubed, fz_cubed;
-  double xr, yr, zr;
-
-  fy = (L + 16.0) / 116.0;
-  fy_cubed = fy*fy*fy;
-
-  fz = fy - (b / 200.0);
-  fz_cubed = fz*fz*fz;
+  double fy = (L + 16.0) / 116.0;
+  double fy_cubed = fy * fy * fy;
 
-  fx = (a / 500.0) + fy;
-  fx_cubed = fx*fx*fx;
+  double fx = fy + a / 500.0;
+  double fx_cubed = fx * fx * fx;
 
-  if (fx_cubed > LAB_EPSILON) xr = fx_cubed;
-  else xr = ((116.0 * fx) - 16) / LAB_KAPPA;
+  double fz = fy - b / 200.0;
+  double fz_cubed = fz * fz * fz;
 
-  if ( L > (LAB_KAPPA * LAB_EPSILON) ) yr = fy_cubed;
-  else yr = (L / LAB_KAPPA);
-
-  if (fz_cubed > LAB_EPSILON) zr = fz_cubed;
-  else zr = ( (116.0 * fz) - 16 ) / LAB_KAPPA;
+  double yr = L > LAB_KAPPA * LAB_EPSILON ? fy_cubed : L / LAB_KAPPA;
+  double xr = fx_cubed > LAB_EPSILON ? fx_cubed : (fx * 116.0 - 16.0) / LAB_KAPPA;
+  double zr = fz_cubed > LAB_EPSILON ? fz_cubed : (fz * 116.0 - 16.0) / LAB_KAPPA;
 
   *to_X = xr * D50_WHITE_REF_X;
   *to_Y = yr * D50_WHITE_REF_Y;
@@ -572,8 +562,6 @@
  * Return cube root of x
  */
 
-#include <stdint.h>
-
 static inline float
 _cbrtf (float x)
 {
@@ -1049,6 +1037,267 @@
     }
 }
 
+#if defined(USE_SSE2)
+
+/* This is an SSE2 version of Halley's method for approximating the
+ * cube root of an IEEE float implementation.
+ *
+ * The scalar version is as follows:
+ *
+ * static inline float
+ * _cbrt_5f (float x)
+ * {
+ *   union { float f; uint32_t i; } u = { x };
+ *
+ *   u.i = u.i / 3 + 709921077;
+ *   return u.f;
+ * }
+ *
+ * static inline float
+ * _cbrta_halleyf (float a, float R)
+ * {
+ *   float a3 = a * a * a;
+ *   float b = a * (a3 + R + R) / (a3 + a3 + R);
+ *   return b;
+ * }
+ *
+ * static inline float
+ * _cbrtf (float x)
+ * {
+ *   float a;
+ *
+ *   a = _cbrt_5f (x);
+ *   a = _cbrta_halleyf (a, x);
+ *   a = _cbrta_halleyf (a, x);
+ *   return a;
+ * }
+ *
+ * The above scalar version seems to have originated from
+ * http://metamerist.com/cbrt/cbrt.htm but that's not accessible
+ * anymore. At present there's a copy in CubeRoot.cpp in the Skia
+ * sources that's licensed under a BSD-style license. There's some
+ * discussion on the implementation at
+ * http://www.voidcn.com/article/p-gpwztojr-wt.html.
+ *
+ * Note that Darktable also has an SSE2 version of the same algorithm,
+ * but uses only a single iteration of Halley's method, which is too
+ * coarse.
+ */
+/* Return cube roots of the four single-precision floating point
+ * components of x.
+ */
+static inline __m128
+_cbrtf_ps_sse2 (__m128 x)
+{
+  const __m128i magic = _mm_set1_epi32 (709921077);
+
+  __m128i xi = _mm_castps_si128 (x);
+  __m128 xi_3 = _mm_div_ps (_mm_cvtepi32_ps (xi), _mm_set1_ps (3.0f));
+  __m128i ai = _mm_add_epi32 (_mm_cvtps_epi32 (xi_3), magic);
+  __m128 a = _mm_castsi128_ps (ai);
+
+  __m128 a3 = _mm_mul_ps (_mm_mul_ps (a, a), a);
+  __m128 divisor = _mm_add_ps (_mm_add_ps (a3, a3), x);
+  a = _mm_div_ps (_mm_mul_ps (a, _mm_add_ps (a3, _mm_add_ps (x, x))), divisor);
+
+  a3 = _mm_mul_ps (_mm_mul_ps (a, a), a);
+  divisor = _mm_add_ps (_mm_add_ps (a3, a3), x);
+  a = _mm_div_ps (_mm_mul_ps (a, _mm_add_ps (a3, _mm_add_ps (x, x))), divisor);
+
+  return a;
+}
+
+static inline __m128
+lab_r_to_f_sse2 (__m128 r)
+{
+  const __m128 epsilon = _mm_set1_ps (LAB_EPSILON);
+  const __m128 kappa = _mm_set1_ps (LAB_KAPPA);
+
+  const __m128 f_big = _cbrtf_ps_sse2 (r);
+
+  const __m128 f_small = _mm_div_ps (_mm_add_ps (_mm_mul_ps (kappa, r), _mm_set1_ps (16.0f)),
+                                     _mm_set1_ps (116.0f));
+
+  const __m128 mask = _mm_cmpgt_ps (r, epsilon);
+  const __m128 f = _mm_or_ps (_mm_and_ps (mask, f_big), _mm_andnot_ps (mask, f_small));
+  return f;
+}
+
+static void
+rgbaf_to_Lf_sse2 (const Babl *conversion, const float *src, float *dst, long samples)
+{
+  const Babl *space = babl_conversion_get_source_space (conversion);
+  const float m_1_0 = space->space.RGBtoXYZf[3] / D50_WHITE_REF_Y;
+  const float m_1_1 = space->space.RGBtoXYZf[4] / D50_WHITE_REF_Y;
+  const float m_1_2 = space->space.RGBtoXYZf[5] / D50_WHITE_REF_Y;
+  long i = 0;
+  long remainder;
+
+  if (((uintptr_t) src % 16) + ((uintptr_t) dst % 16) == 0)
+    {
+      const long    n = (samples / 4) * 4;
+      const __m128 m_1_0_v = _mm_set1_ps (m_1_0);
+      const __m128 m_1_1_v = _mm_set1_ps (m_1_1);
+      const __m128 m_1_2_v = _mm_set1_ps (m_1_2);
+
+      for ( ; i < n; i += 4)
+        {
+          __m128 rgba0 = _mm_load_ps (src);
+          __m128 rgba1 = _mm_load_ps (src + 4);
+          __m128 rgba2 = _mm_load_ps (src + 8);
+          __m128 rgba3 = _mm_load_ps (src + 12);
+
+          __m128 r = rgba0;
+          __m128 g = rgba1;
+          __m128 b = rgba2;
+          __m128 a = rgba3;
+          _MM_TRANSPOSE4_PS (r, g, b, a);
+
+          {
+            __m128 yr = _mm_add_ps (_mm_add_ps (_mm_mul_ps (m_1_0_v, r), _mm_mul_ps (m_1_1_v, g)),
+                                    _mm_mul_ps (m_1_2_v, b));
+
+            __m128 fy = lab_r_to_f_sse2 (yr);
+
+            __m128 L = _mm_sub_ps (_mm_mul_ps (_mm_set1_ps (116.0f), fy), _mm_set1_ps (16.0f));
+
+            _mm_store_ps (dst, L);
+          }
+
+          src += 16;
+          dst += 4;
+        }
+    }
+
+  remainder = samples - i;
+  while (remainder--)
+    {
+      float r = src[0];
+      float g = src[1];
+      float b = src[2];
+
+      float yr = m_1_0 * r + m_1_1 * g + m_1_2 * b;
+      float L = yr > LAB_EPSILON ? 116.0f * _cbrtf (yr) - 16 : LAB_KAPPA * yr;
+
+      dst[0] = L;
+
+      src += 4;
+      dst += 1;
+    }
+}
+
+static void
+rgbaf_to_Labaf_sse2 (const Babl *conversion, const float *src, float *dst, long samples)
+{
+  const Babl *space = babl_conversion_get_source_space (conversion);
+  const float m_0_0 = space->space.RGBtoXYZf[0] / D50_WHITE_REF_X;
+  const float m_0_1 = space->space.RGBtoXYZf[1] / D50_WHITE_REF_X;
+  const float m_0_2 = space->space.RGBtoXYZf[2] / D50_WHITE_REF_X;
+  const float m_1_0 = space->space.RGBtoXYZf[3] / D50_WHITE_REF_Y;
+  const float m_1_1 = space->space.RGBtoXYZf[4] / D50_WHITE_REF_Y;
+  const float m_1_2 = space->space.RGBtoXYZf[5] / D50_WHITE_REF_Y;
+  const float m_2_0 = space->space.RGBtoXYZf[6] / D50_WHITE_REF_Z;
+  const float m_2_1 = space->space.RGBtoXYZf[7] / D50_WHITE_REF_Z;
+  const float m_2_2 = space->space.RGBtoXYZf[8] / D50_WHITE_REF_Z;
+  long i = 0;
+  long remainder;
+
+  if (((uintptr_t) src % 16) + ((uintptr_t) dst % 16) == 0)
+    {
+      const long    n = (samples / 4) * 4;
+      const __m128 m_0_0_v = _mm_set1_ps (m_0_0);
+      const __m128 m_0_1_v = _mm_set1_ps (m_0_1);
+      const __m128 m_0_2_v = _mm_set1_ps (m_0_2);
+      const __m128 m_1_0_v = _mm_set1_ps (m_1_0);
+      const __m128 m_1_1_v = _mm_set1_ps (m_1_1);
+      const __m128 m_1_2_v = _mm_set1_ps (m_1_2);
+      const __m128 m_2_0_v = _mm_set1_ps (m_2_0);
+      const __m128 m_2_1_v = _mm_set1_ps (m_2_1);
+      const __m128 m_2_2_v = _mm_set1_ps (m_2_2);
+
+      for ( ; i < n; i += 4)
+        {
+          __m128 Laba0;
+          __m128 Laba1;
+          __m128 Laba2;
+          __m128 Laba3;
+
+          __m128 rgba0 = _mm_load_ps (src);
+          __m128 rgba1 = _mm_load_ps (src + 4);
+          __m128 rgba2 = _mm_load_ps (src + 8);
+          __m128 rgba3 = _mm_load_ps (src + 12);
+
+          __m128 r = rgba0;
+          __m128 g = rgba1;
+          __m128 b = rgba2;
+          __m128 a = rgba3;
+          _MM_TRANSPOSE4_PS (r, g, b, a);
+
+          {
+            __m128 xr = _mm_add_ps (_mm_add_ps (_mm_mul_ps (m_0_0_v, r), _mm_mul_ps (m_0_1_v, g)),
+                                    _mm_mul_ps (m_0_2_v, b));
+            __m128 yr = _mm_add_ps (_mm_add_ps (_mm_mul_ps (m_1_0_v, r), _mm_mul_ps (m_1_1_v, g)),
+                                    _mm_mul_ps (m_1_2_v, b));
+            __m128 zr = _mm_add_ps (_mm_add_ps (_mm_mul_ps (m_2_0_v, r), _mm_mul_ps (m_2_1_v, g)),
+                                    _mm_mul_ps (m_2_2_v, b));
+
+            __m128 fx = lab_r_to_f_sse2 (xr);
+            __m128 fy = lab_r_to_f_sse2 (yr);
+            __m128 fz = lab_r_to_f_sse2 (zr);
+
+            __m128 L = _mm_sub_ps (_mm_mul_ps (_mm_set1_ps (116.0f), fy), _mm_set1_ps (16.0f));
+            __m128 A = _mm_mul_ps (_mm_set1_ps (500.0f), _mm_sub_ps (fx, fy));
+            __m128 B = _mm_mul_ps (_mm_set1_ps (200.0f), _mm_sub_ps (fy, fz));
+
+            Laba0 = L;
+            Laba1 = A;
+            Laba2 = B;
+            Laba3 = a;
+            _MM_TRANSPOSE4_PS (Laba0, Laba1, Laba2, Laba3);
+          }
+
+          _mm_store_ps (dst, Laba0);
+          _mm_store_ps (dst + 4, Laba1);
+          _mm_store_ps (dst + 8, Laba2);
+          _mm_store_ps (dst + 12, Laba3);
+
+          src += 16;
+          dst += 16;
+        }
+    }
+
+  remainder = samples - i;
+  while (remainder--)
+    {
+      float r = src[0];
+      float g = src[1];
+      float b = src[2];
+      float a = src[3];
+
+      float xr = m_0_0 * r + m_0_1 * g + m_0_2 * b;
+      float yr = m_1_0 * r + m_1_1 * g + m_1_2 * b;
+      float zr = m_2_0 * r + m_2_1 * g + m_2_2 * b;
+
+      float fx = xr > LAB_EPSILON ? _cbrtf (xr) : (LAB_KAPPA * xr + 16.0f) / 116.0f;
+      float fy = yr > LAB_EPSILON ? _cbrtf (yr) : (LAB_KAPPA * yr + 16.0f) / 116.0f;
+      float fz = zr > LAB_EPSILON ? _cbrtf (zr) : (LAB_KAPPA * zr + 16.0f) / 116.0f;
+
+      float L = 116.0f * fy - 16.0f;
+      float A = 500.0f * (fx - fy);
+      float B = 200.0f * (fy - fz);
+
+      dst[0] = L;
+      dst[1] = A;
+      dst[2] = B;
+      dst[3] = a;
+
+      src += 4;
+      dst += 4;
+    }
+}
+
+#endif /* defined(USE_SSE2) */
+
 static void
 conversions (void)
 {
@@ -1215,6 +1464,27 @@
     NULL
   );
 
+#if defined(USE_SSE2)
+
+  if (babl_cpu_accel_get_support () & BABL_CPU_ACCEL_X86_SSE2)
+    {
+      babl_conversion_new (
+        babl_format ("RGBA float"),
+        babl_format ("CIE Lab alpha float"),
+        "linear", rgbaf_to_Labaf_sse2,
+        NULL
+      );
+
+      babl_conversion_new (
+        babl_format ("RGBA float"),
+        babl_format ("CIE L float"),
+        "linear", rgbaf_to_Lf_sse2,
+        NULL
+      );
+    }
+
+#endif /* defined(USE_SSE2) */
+
   rgbcie_init ();
 }
 
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' --exclude Makefile.in --exclude configure --exclude config.guess --exclude '*.pot' --exclude mkinstalldirs --exclude aclocal.m4 --exclude config.sub --exclude depcomp --exclude install-sh --exclude ltmain.sh old/babl-0.1.46/extensions/Makefile.am new/babl-0.1.50/extensions/Makefile.am
--- old/babl-0.1.46/extensions/Makefile.am      2018-04-07 17:08:01.000000000 +0200
+++ new/babl-0.1.50/extensions/Makefile.am      2018-05-10 14:41:10.000000000 +0200
@@ -71,6 +71,7 @@
 LIBS =  $(top_builddir)/babl/libbabl-@BABL_API_VERSION@.la \
        $(MATH_LIB) $(THREAD_LIB)
 
+CIE_la_CFLAGS = $(SSE2_EXTRA_CFLAGS)
 sse2_float_la_CFLAGS = $(SSE2_EXTRA_CFLAGS)
 sse2_int8_la_CFLAGS = $(SSE2_EXTRA_CFLAGS)
 sse2_int16_la_CFLAGS = $(SSE2_EXTRA_CFLAGS)
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' --exclude Makefile.in --exclude configure --exclude config.guess --exclude '*.pot' --exclude mkinstalldirs --exclude aclocal.m4 --exclude config.sub --exclude depcomp --exclude install-sh --exclude ltmain.sh old/babl-0.1.46/extensions/two-table.c new/babl-0.1.50/extensions/two-table.c
--- old/babl-0.1.46/extensions/two-table.c      2018-04-07 16:38:23.000000000 +0200
+++ new/babl-0.1.50/extensions/two-table.c      2018-04-24 20:14:52.000000000 +0200
@@ -205,17 +205,6 @@
 
   if (littleendian)
     {
-      const Babl *f32 = babl_format_new (
-        "name", "cairo-ARGB32",
-        babl_model ("R'aG'aB'aA"),
-        babl_type ("u8"),
-        babl_component ("B'a"),
-        babl_component ("G'a"),
-        babl_component ("R'a"),
-        babl_component ("A"),
-        NULL
-      );
-
       const Babl *f24 = babl_format_new (
         "name", "cairo-RGB24",
         babl_model ("R'G'B'"),
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' --exclude Makefile.in --exclude configure --exclude config.guess --exclude '*.pot' --exclude mkinstalldirs --exclude aclocal.m4 --exclude config.sub --exclude depcomp --exclude install-sh --exclude ltmain.sh old/babl-0.1.46/tests/palette-concurrency-stress-test.c new/babl-0.1.50/tests/palette-concurrency-stress-test.c
--- old/babl-0.1.46/tests/palette-concurrency-stress-test.c     2017-09-21 21:06:11.000000000 +0200
+++ new/babl-0.1.50/tests/palette-concurrency-stress-test.c     2018-05-19 17:21:32.000000000 +0200
@@ -77,13 +77,13 @@
 
       v = i * BABL_PALETTE_HASH_TABLE_SIZE;
 
-      p[0] = (v >> 16) & 0xff;
+      p[0] = (v >>  0) & 0xff;
       p[1] = (v >>  8) & 0xff;
-      p[2] = (v >>  0) & 0xff;
+      p[2] = (v >> 16) & 0xff;
       p[3] = 0xff;
     }
 
-  babl_palette_set_palette (pal, babl_format ("RGBA u8"), colors, N_THREADS);
+  babl_palette_set_palette (pal, babl_format ("R'G'B'A u8"), colors, N_THREADS);
 
   /* initialize the thread contexts such that each thread processes a buffer
    * containing a single, distinct color
@@ -92,7 +92,7 @@
     {
       ctx[i] = malloc (sizeof (ThreadContext));
 
-      ctx[i]->fish = babl_fish (babl_format ("RGBA u8"), pal_format);
+      ctx[i]->fish = babl_fish (babl_format ("R'G'B'A u8"), pal_format);
 
       for (j = 0; j < 4 * N_PIXELS; j++)
         {
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' --exclude Makefile.in --exclude configure --exclude config.guess --exclude '*.pot' --exclude mkinstalldirs --exclude aclocal.m4 --exclude config.sub --exclude depcomp --exclude install-sh --exclude ltmain.sh old/babl-0.1.46/tools/babl_fish_path_fitness.c new/babl-0.1.50/tools/babl_fish_path_fitness.c
--- old/babl-0.1.46/tools/babl_fish_path_fitness.c      2018-04-08 15:28:44.000000000 +0200
+++ new/babl-0.1.50/tools/babl_fish_path_fitness.c      2018-05-19 13:34:30.000000000 +0200
@@ -91,7 +91,7 @@
 static int source_each (Babl *babl,
                         void *userdata)
 {
-  printf (SL);
+  printf ("%s", SL);
   babl_format_class_for_each (destination_each, babl);
 #ifdef UTF8
   printf ("──%2i %s%s", source_no++, babl->instance.name, NL);


Reply via email to