bug#6366: join can't join on numeric fields

Drew Frank Wed, 21 Mar 2012 09:12:01 -0700

Hi,

I've attached a patch that implements both "numeric sort" (-n) and "general
numeric sort" (-g).. I copied the relevant comparison code from sort.c to
ensure consistent behavior. Should these functions be extracted and put in
a shared header file, or is it okay to duplicate the code?


In terms of testing, I confirmed that the code correctly deals with
different single-byte thousand separators on my local machine, but I wasn't
sure how to exercise that in the test suite since I can't rely on everyone
having a particular locale available. How should that be handled?

I also updated the relevant docs.

Thanks,
Drew

P.S. I sent a previous version of the patch to the wrong address and
accidentally initiated a new bug thread (#10924). This patch supersedes the
previous.

From 99a9c4c7d7a828a917e14f7e5241921a4dc77909 Mon Sep 17 00:00:00 2001
From: Drew Frank <[email protected]>
Date: Thu, 1 Mar 2012 14:24:49 -0800
Subject: [PATCH] join: add numeric sort feature (-g and -n flags).

* src/join.c: add flags and implement numeric comparison features.
* tests/misc/join: add three tests for numerically sorted key fields.
* doc/coreutils.texi: document new functionality.
---
 doc/coreutils.texi |   16 ++++++
 gnulib             |    2 +-
 src/join.c         |  157 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 tests/misc/join    |    8 +++
 4 files changed, 178 insertions(+), 5 deletions(-)

diff --git a/doc/coreutils.texi b/doc/coreutils.texi
index 10be715..017118f 100644
--- a/doc/coreutils.texi
+++ b/doc/coreutils.texi
@@ -5798,6 +5798,14 @@ specified format.  The header lines will not be checked for ordering even if
 @option{--check-order} is specified.  Also if the header lines from each file
 do not match, the heading fields from the first file will be used.
 
+@item -g
+@itemx --general-numeric-sort
+@opindex -g
+@opindex --general-numeric-sort
+Compute general numerical value when comparing keys.
+With this option, the lines of the input files must be ordered in the same way.
+Use @samp{sort -g} to produce this ordering.
+
 @item -i
 @itemx --ignore-case
 @opindex -i
@@ -5806,6 +5814,14 @@ Ignore differences in case when comparing keys.
 With this option, the lines of the input files must be ordered in the same way.
 Use @samp{sort -f} to produce this ordering.
 
+@item -n
+@itemx --numeric-sort
+@opindex -n
+@opindex --numeric-sort
+Compute string numerical value when comparing keys.
+With this option, the lines of the input files must be ordered in the same way.
+Use @samp{sort -n} to produce this ordering.
+
 @item -1 @var{field}
 @opindex -1
 Join on field @var{field} (a positive integer) of file 1.
diff --git a/gnulib b/gnulib
index 4730c3e..cbc11ff 160000
--- a/gnulib
+++ b/gnulib
@@ -1 +1 @@
-Subproject commit 4730c3e3692b344effb72d46b3ff92db0bdb797a
+Subproject commit cbc11ff0020eb9c04caea6b3e7dc4e4281dff1f9
diff --git a/src/join.c b/src/join.c
index b92c1f8..95dac47 100644
--- a/src/join.c
+++ b/src/join.c
@@ -30,6 +30,7 @@
 #include "memcasecmp.h"
 #include "quote.h"
 #include "stdio--.h"
+#include "strnumcmp.h"
 #include "xmemcoll.h"
 #include "xstrtol.h"
 #include "argmatch.h"
@@ -47,6 +48,16 @@
   b = tmp; \
 } while (0);
 
+#define UCHAR_LIM (UCHAR_MAX + 1)
+
+#if HAVE_C99_STRTOLD
+# define long_double long double
+#else
+# define long_double double
+# undef strtold
+# define strtold strtod
+#endif
+
 /* An element of the list identifying which fields to print for each
    output line.  */
 struct outlist
@@ -100,6 +111,12 @@ static char *g_names[2];
    want to overwrite the previous buffer before we check order. */
 static struct line *spareline[2] = {NULL, NULL};
 
+/* The representation of the decimal point in the current locale.  */
+static int decimal_point;
+
+/* Thousands separator; if -1, then there isn't one.  */
+static int thousands_sep;
+
 /* True if the LC_COLLATE locale is hard.  */
 static bool hard_LC_COLLATE;
 
@@ -140,6 +157,9 @@ static struct outlist *outlist_end = &outlist_head;
    tab character whose value (when cast to unsigned char) equals TAB.  */
 static int tab = -1;
 
+/* Table of blanks.  */
+static bool blanks[UCHAR_LIM];
+
 /* If nonzero, check that the input is correctly ordered. */
 static enum
   {
@@ -158,7 +178,9 @@ enum
 
 static struct option const longopts[] =
 {
+  {"general-numeric-sort", no_argument, NULL, 'g'},
   {"ignore-case", no_argument, NULL, 'i'},
+  {"numeric-sort", no_argument, NULL, 'n'},
   {"check-order", no_argument, NULL, CHECK_ORDER_OPTION},
   {"nocheck-order", no_argument, NULL, NOCHECK_ORDER_OPTION},
   {"header", no_argument, NULL, HEADER_LINE_OPTION},
@@ -173,6 +195,12 @@ static struct line uni_blank;
 /* If nonzero, ignore case when comparing join fields.  */
 static bool ignore_case;
 
+/* If nonzero, treat keys as numeric values.  */
+static bool numeric_sort;
+
+/* If nonzero, treat keys as general numeric values.  */
+static bool general_numeric_sort;
+
 /* If nonzero, treat the first line of each file as column headers -
    join them without checking for ordering */
 static bool join_header_lines;
@@ -198,7 +226,9 @@ by whitespace.  When FILE1 or FILE2 (not both) is -, read standard input.\n\
   -e EMPTY          replace missing input fields with EMPTY\n\
 "), stdout);
       fputs (_("\
-  -i, --ignore-case  ignore differences in case when comparing fields\n\
+  -g, --general-numeric-sort  compare according to general numerical value\n\
+  -i, --ignore-case   ignore differences in case when comparing fields\n\
+  -n, --numeric-sort  compare according to string numerical value\n\
   -j FIELD          equivalent to '-1 FIELD -2 FIELD'\n\
   -o FORMAT         obey FORMAT while constructing output line\n\
   -t CHAR           use CHAR as input and output field separator\n\
@@ -303,6 +333,73 @@ freeline (struct line *line)
   line->buf.buffer = NULL;
 }
 
+/* Compare strings A and B as numbers without explicitly converting them to
+   machine numbers.  Comparatively slow for short strings, but asymptotically
+   hideously fast. Copied from sort.c. */
+
+static int
+numcompare (char const *a, char const *b)
+{
+  while (blanks[to_uchar (*a)])
+    a++;
+  while (blanks[to_uchar (*b)])
+    b++;
+
+  return strnumcmp (a, b, decimal_point, thousands_sep);
+}
+
+/* Work around a problem whereby the long double value returned by glibc's
+   strtold ("NaN", ...) contains uninitialized bits: clear all bytes of
+   A and B before calling strtold.  FIXME: remove this function once
+   gnulib guarantees that strtold's result is always well defined.
+   Copied from sort.c. */
+
+static int
+nan_compare (char const *sa, char const *sb)
+{
+  long_double a;
+  memset (&a, 0, sizeof a);
+  a = strtold (sa, NULL);
+
+  long_double b;
+  memset (&b, 0, sizeof b);
+  b = strtold (sb, NULL);
+
+  return memcmp (&a, &b, sizeof a);
+}
+
+
+/* Compare general numerical value by conversion to long double.
+ * Copied from sort.c. */
+
+static int
+general_numcompare (char const *sa, char const *sb)
+{
+  /* FIXME: maybe add option to try expensive FP conversion
+     only if A and B can't be compared more cheaply/accurately.  */
+
+  char *ea;
+  char *eb;
+  long_double a = strtold (sa, &ea);
+  long_double b = strtold (sb, &eb);
+
+  /* Put conversion errors at the start of the collating sequence.  */
+  if (sa == ea)
+    return sb == eb ? 0 : -1;
+  if (sb == eb)
+    return 1;
+
+  /* Sort numbers in the usual way, where -0 == +0.  Put NaNs after
+     conversion errors but before numbers; sort them by internal
+     bit-pattern, for lack of a more portable alternative.  */
+  return (a < b ? -1
+          : a > b ? 1
+          : a == b ? 0
+          : b == b ? -1
+          : a == a ? 1
+          : nan_compare (sa, sb));
+}
+
 /* Return <0 if the join field in LINE1 compares less than the one in LINE2;
    >0 if it compares greater; 0 if it compares equal.
    Report an error and exit if the comparison fails.
@@ -318,6 +415,7 @@ keycmp (struct line const *line1, struct line const *line2,
 
   size_t len1;
   size_t len2;		/* Length of fields to compare.  */
+  long double x1, x2;
   int diff;
 
   if (jf_1 < line1->nfields)
@@ -347,7 +445,19 @@ keycmp (struct line const *line1, struct line const *line2,
   if (len2 == 0)
     return 1;
 
-  if (ignore_case)
+  if (numeric_sort)
+    {
+      char end1 = beg1[len1];
+      char end2 = beg2[len2];
+      beg1[len1] = '\0'; beg2[len2] = '\0';
+      diff = numcompare (beg1, beg2);
+      beg1[len1] = end1; beg2[len2] = end2;
+    }
+  else if (general_numeric_sort)
+    {
+        diff = general_numcompare (beg1, beg2);
+    }
+  else if (ignore_case)
     {
       /* FIXME: ignore_case does not work with NLS (in particular,
          with multibyte chars).  */
@@ -360,7 +470,7 @@ keycmp (struct line const *line1, struct line const *line2,
       diff = memcmp (beg1, beg2, MIN (len1, len2));
     }
 
-  if (diff)
+  if (diff || numeric_sort || general_numeric_sort)
     return diff;
   return len1 < len2 ? -1 : len1 != len2;
 }
@@ -412,6 +522,19 @@ check_order (const struct line *prev,
     }
 }
 
+/* Initialize the character class tables. */
+
+static void
+inittables (void)
+{
+  size_t i;
+
+  for (i = 0; i < UCHAR_LIM; ++i)
+    {
+      blanks[i] = !! isblank (i);
+    }
+}
+
 static inline void
 reset_line (struct line *line)
 {
@@ -1009,6 +1132,24 @@ main (int argc, char **argv)
   textdomain (PACKAGE);
   hard_LC_COLLATE = hard_locale (LC_COLLATE);
 
+  /* Get locale's representation of the decimal point.  */
+  {
+    struct lconv const *locale = localeconv ();
+
+    /* If the locale doesn't define a decimal point, or if the decimal
+       point is multibyte, use the C locale's decimal point.  FIXME:
+       add support for multibyte decimal points.  */
+    decimal_point = to_uchar (locale->decimal_point[0]);
+    if (! decimal_point || locale->decimal_point[1])
+      decimal_point = '.';
+
+    /* FIXME: add support for multibyte thousands separators.  */
+    thousands_sep = to_uchar (*locale->thousands_sep);
+    if (! thousands_sep || locale->thousands_sep[1])
+      thousands_sep = -1;
+  }
+  inittables ();
+
   atexit (close_stdout);
   atexit (free_spareline);
 
@@ -1017,7 +1158,7 @@ main (int argc, char **argv)
   issued_disorder_warning[0] = issued_disorder_warning[1] = false;
   check_input_order = CHECK_ORDER_DEFAULT;
 
-  while ((optc = getopt_long (argc, argv, "-a:e:i1:2:j:o:t:v:",
+  while ((optc = getopt_long (argc, argv, "-a:e:gin1:2:j:o:t:v:",
                               longopts, NULL))
          != -1)
     {
@@ -1050,10 +1191,18 @@ main (int argc, char **argv)
           empty_filler = optarg;
           break;
 
+        case 'g':
+          general_numeric_sort = true;
+          break;
+
         case 'i':
           ignore_case = true;
           break;
 
+        case 'n':
+          numeric_sort = true;
+          break;
+
         case '1':
           set_join_field (&join_field_1, string_to_join_field (optarg));
           break;
diff --git a/tests/misc/join b/tests/misc/join
index a3fd1a8..2dce94e 100755
--- a/tests/misc/join
+++ b/tests/misc/join
@@ -147,6 +147,14 @@ my @tv = (
  ["a,1,,2\nb,1,2\n", "a,3,4\nb,3,4\n"],
  "a,1,,2,3,4\nb,1,2,,3,4\n"],
 
+# Join on numerically sorted field.
+['numeric-1', '-n', ["7 s\n8 e\n10 t\n4000 f\n", "7 S\n9 N\n4000 F\n"],
+    "7 s S\n4000 f F\n", 0],
+['numeric-2', '',   ["7 s\n8 e\n10 t\n", "7 S\n9 N\n10 T\n"],
+    "7 s S\n", 1, "$prog: numeric-2.2:3: is not sorted: 10 T\n"],
+['numeric-3', '-g',   ["7 s\n5e2 f\n800 e", "70e-1 S\n500.0 F\n"],
+    "7 s S\n5e2 f F\n", 0],
+
 # From Tim Smithers: fixed in 1.22l
 ['trailing-sp', '-t: -1 1 -2 1', ["a:x \n", "a:y \n"], "a:x :y \n", 0],
 
-- 
1.7.9.4

bug#6366: join can't join on numeric fields

Reply via email to