bug#6903: join: improve paralleles to sort?

Bernhard Schiffner Thu, 26 Aug 2010 12:39:49 -0700

Am Mittwoch, 25. August 2010, 18:22:13 schrieb Paul Eggert:
> On 08/24/2010 11:57 PM, Bernhard Schiffner wrote:
> > 2146427     /LBAtoJM/ROOT/WEB-INF/lib/hibernate-3.2.0.cr3.jar
> > 214618118   /temp/marketing_ms/emails.dat
> 
> That won't work, because the two lines are not sorted correctly.
> Recall that join uses lexicographic comparison, not numeric.
> Its input must be sorted lexicographically.


Ok.
I solved my problem using the attached patch.

The patch shows that it is possible to use different sortings for keys 
(joinfield) in join.

I integrated some / most of the code from sort.c verbaly  in order to see 
what's needed to compile it successfully in join.c .
I did no tests beside my special usecase mentioned earlier.

It's clear that a user-friendly key-selection needs a lot more work. Same is 
about a unified version of join and sort.

Thanks to Paul and Christian Perle for their valueable help so far.

The FSF can make any use of the code here. 
It was theirs already before  ;-)


Bernhard

diff --git a/src/join.c b/src/join.c
index fa18c9d..b02dc08 100644
--- a/src/join.c
+++ b/src/join.c
@@ -47,6 +47,308 @@
   b = tmp; \
 } while (0);
 
+/*
+ * The code here is an as verbal copy from sort.c. as possible.
+ * It's here to test the unification of key handling between sort and join.
+ *
+ * Test it in join.c first and decide about move into a shared library
+ * later on.
+ */
+
+/* TODO : check includes carefully */
+#include "strnumcmp.h"
+/* #include "hard-locale.h" */
+#include "langinfo.h"
+#include "strnumcmp.h"
+/* #include "unistr.h" */
+
+#define UCHAR_LIM (UCHAR_MAX + 1)
+
+#if HAVE_C99_STRTOLD
+# define long_double long double
+#else
+# define long_double double
+# undef strtold
+# define strtold strtod
+#endif
+
+/* The representation of the decimal point in the current locale.  */
+static int decimal_point;
+
+/* Thousands separator; if -1, then there isn't one.  */
+static int thousands_sep;
+
+/* Nonzero if the corresponding locales are hard.  */
+static bool hard_LC_COLLATE;
+#if HAVE_NL_LANGINFO
+static bool hard_LC_TIME;
+#endif
+
+/* The kind of blanks for '-b' to skip in various options. */
+enum blanktype { bl_start, bl_end, bl_both };
+
+/* Table of blanks.  */
+static bool blanks[UCHAR_LIM];
+
+/* Table of non-printing characters. */
+static bool nonprinting[UCHAR_LIM];
+
+/* Table of non-dictionary characters (not letters, digits, or blanks). */
+static bool nondictionary[UCHAR_LIM];
+
+/* Translation table folding lower case to upper.  */
+static unsigned char fold_toupper[UCHAR_LIM];
+
+#define MONTHS_PER_YEAR 12
+
+struct month
+{
+  char const *name;
+  int val;
+};
+
+/* Table mapping month names to integers.
+   Alphabetic order allows binary search. */
+static struct month monthtab[] =
+{
+  {"APR", 4},
+  {"AUG", 8},
+  {"DEC", 12},
+  {"FEB", 2},
+  {"JAN", 1},
+  {"JUL", 7},
+  {"JUN", 6},
+  {"MAR", 3},
+  {"MAY", 5},
+  {"NOV", 11},
+  {"OCT", 10},
+  {"SEP", 9}
+};
+
+#if HAVE_NL_LANGINFO
+
+static int
+struct_month_cmp (void const *m1, void const *m2)
+{
+  struct month const *month1 = m1;
+  struct month const *month2 = m2;
+  return strcmp (month1->name, month2->name);
+}
+
+#endif
+
+/* Initialize the character class tables. */
+
+static void
+inittables (void)
+{
+  size_t i;
+
+  for (i = 0; i < UCHAR_LIM; ++i)
+    {
+      blanks[i] = !! isblank (i);
+      nonprinting[i] = ! isprint (i);
+      nondictionary[i] = ! isalnum (i) && ! isblank (i);
+      fold_toupper[i] = toupper (i);
+    }
+
+#if HAVE_NL_LANGINFO
+  /* If we're not in the "C" locale, read different names for months.  */
+  if (hard_LC_TIME)
+    {
+      for (i = 0; i < MONTHS_PER_YEAR; i++)
+        {
+          char const *s;
+          size_t s_len;
+          size_t j, k;
+          char *name;
+
+          s = nl_langinfo (ABMON_1 + i);
+          s_len = strlen (s);
+          monthtab[i].name = name = xmalloc (s_len + 1);
+          monthtab[i].val = i + 1;
+
+          for (j = k = 0; j < s_len; j++)
+            if (! isblank (to_uchar (s[j])))
+              name[k++] = fold_toupper[to_uchar (s[j])];
+          name[k] = '\0';
+        }
+      qsort (monthtab, MONTHS_PER_YEAR, sizeof *monthtab, struct_month_cmp);
+    }
+#endif
+}
+
+/* Table that maps characters to order-of-magnitude values.  */
+static char const unit_order[UCHAR_LIM] =
+  {
+#if ! ('K' == 75 && 'M' == 77 && 'G' == 71 && 'T' == 84 && 'P' == 80 \
+     && 'E' == 69 && 'Z' == 90 && 'Y' == 89 && 'k' == 107)
+    /* This initializer syntax works on all C99 hosts.  For now, use
+       it only on non-ASCII hosts, to ease the pain of porting to
+       pre-C99 ASCII hosts.  */
+    ['K']=1, ['M']=2, ['G']=3, ['T']=4, ['P']=5, ['E']=6, ['Z']=7, ['Y']=8,
+    ['k']=1,
+#else
+    /* Generate the following table with this command:
+       perl -e 'my %a=(k=>1, K=>1, M=>2, G=>3, T=>4, P=>5, E=>6, Z=>7, Y=>8);
+       foreach my $i (0..255) {my $c=chr($i); $a{$c} ||= 0;print "$a{$c}, "}'\
+       |fmt  */
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 3,
+    0, 0, 0, 1, 0, 2, 0, 0, 5, 0, 0, 0, 4, 0, 0, 0, 0, 8, 7, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+#endif
+  };
+
+/* Return an integer that represents the order of magnitude of the
+   unit following the number.  The number may contain thousands
+   separators and a decimal point, but it may not contain leading blanks.
+   Negative numbers get negative orders; zero numbers have a zero order.  */
+
+static int
+find_unit_order (char const *number)
+{
+  bool minus_sign = (*number == '-');
+  char const *p = number + minus_sign;
+  int nonzero = 0;
+  unsigned char ch;
+
+  /* Scan to end of number.
+     Decimals or separators not followed by digits stop the scan.
+     Numbers ending in decimals or separators are thus considered
+     to be lacking in units.
+     FIXME: add support for multibyte thousands_sep and decimal_point.  */
+
+  do
+    {
+      while (ISDIGIT (ch = *p++))
+        nonzero |= ch - '0';
+    }
+  while (ch == thousands_sep);
+
+  if (ch == decimal_point)
+    while (ISDIGIT (ch = *p++))
+      nonzero |= ch - '0';
+
+  if (nonzero)
+    {
+      int order = unit_order[ch];
+      return (minus_sign ? -order : order);
+    }
+  else
+    return 0;
+}
+
+/* Compare numbers A and B ending in units with SI or IEC prefixes
+       <none/unknown> < K/k < M < G < T < P < E < Z < Y  */
+
+static int
+human_numcompare (char const *a, char const *b)
+{
+  while (blanks[to_uchar (*a)])
+    a++;
+  while (blanks[to_uchar (*b)])
+    b++;
+
+  int diff = find_unit_order (a) - find_unit_order (b);
+  return (diff ? diff : strnumcmp (a, b, decimal_point, thousands_sep));
+}
+
+/* Compare strings A and B as numbers without explicitly converting them to
+   machine numbers.  Comparatively slow for short strings, but asymptotically
+   hideously fast. */
+
+static int
+numcompare (char const *a, char const *b)
+{
+  while (blanks[to_uchar (*a)])
+    a++;
+  while (blanks[to_uchar (*b)])
+    b++;
+
+  return strnumcmp (a, b, decimal_point, thousands_sep);
+}
+
+static int
+general_numcompare (char const *sa, char const *sb)
+{
+  /* FIXME: maybe add option to try expensive FP conversion
+     only if A and B can't be compared more cheaply/accurately.  */
+
+  char *ea;
+  char *eb;
+  long_double a = strtold (sa, &ea);
+  long_double b = strtold (sb, &eb);
+
+  /* Put conversion errors at the start of the collating sequence.  */
+  if (sa == ea)
+    return sb == eb ? 0 : -1;
+  if (sb == eb)
+    return 1;
+
+  /* Sort numbers in the usual way, where -0 == +0.  Put NaNs after
+     conversion errors but before numbers; sort them by internal
+     bit-pattern, for lack of a more portable alternative.  */
+  return (a < b ? -1
+          : a > b ? 1
+          : a == b ? 0
+          : b == b ? -1
+          : a == a ? 1
+          : memcmp (&a, &b, sizeof a));
+}
+
+/* Return an integer in 1..12 of the month name MONTH.
+   Return 0 if the name in S is not recognized.  */
+
+static int
+getmonth (char const *month, char **ea)
+{
+  size_t lo = 0;
+  size_t hi = MONTHS_PER_YEAR;
+
+  while (blanks[to_uchar (*month)])
+    month++;
+
+  do
+    {
+      size_t ix = (lo + hi) / 2;
+      char const *m = month;
+      char const *n = monthtab[ix].name;
+
+      for (;; m++, n++)
+        {
+          if (!*n)
+            {
+              if (ea)
+                *ea = (char *) m;
+              return monthtab[ix].val;
+            }
+          if (fold_toupper[to_uchar (*m)] < to_uchar (*n))
+            {
+              hi = ix;
+              break;
+            }
+          else if (fold_toupper[to_uchar (*m)] > to_uchar (*n))
+            {
+              lo = ix + 1;
+              break;
+            }
+        }
+    }
+  while (lo < hi);
+
+  return 0;
+}
+
+/* Import from copy.c ends here */
+
 /* An element of the list identifying which fields to print for each
    output line.  */
 struct outlist
@@ -201,6 +503,24 @@ by whitespace.  When FILE1 or FILE2 (not both) is -, read standard input.\n\
   --header          treat the first line in each file as field headers,\n\
                       print them without trying to pair them\n\
 "), stdout);
+      fputs (_("\
+  -b, --ignore-leading-blanks  ignore leading blanks\n\
+  -d, --dictionary-order      consider only blanks and alphanumeric characters\n\
+  -f, --ignore-case           fold lower case to upper case characters\n\
+"), stdout);
+      fputs (_("\
+  -g, --general-numeric-sort  compare according to general numerical value\n\
+  -h, --human-numeric-sort    compare human readable numbers (e.g., 2K 1G)\n\
+  -n, --numeric-sort          compare according to string numerical value\n\
+  -M, --month-sort            compare (unknown) < `JAN' < ... < `DEC'\n\
+  -V, --version-sort          natural sort of (version) numbers within text\n\
+"), stdout);
+      fputs (_("\
+      --sort=WORD             sort according to WORD:\n\
+                                general-numeric -g, human-numeric -h, month -M,\n\
+                                numeric -n, version -V\n\
+\n\
+"), stdout);
       fputs (HELP_OPTION_DESCRIPTION, stdout);
       fputs (VERSION_OPTION_DESCRIPTION, stdout);
       fputs (_("\
@@ -329,6 +649,16 @@ keycmp (struct line const *line1, struct line const *line2,
       len2 = 0;
     }
 
+/* The following ifdef'ed part allowed to solve the problem
+ * http://debbugs.gnu.org/cgi/bugreport.cgi?bug=6903
+ * describes.
+ * A real solution will need lots of inprovements here.
+ */
+
+#undef KEY_NUMCMP
+#ifdef KEY_NUMCMP
+  diff = numcompare (beg1, beg2);
+#else
   if (len1 == 0)
     return len2 == 0 ? 0 : -1;
   if (len2 == 0)
@@ -343,13 +673,15 @@ keycmp (struct line const *line1, struct line const *line2,
   else
     {
       if (hard_LC_COLLATE)
-        return xmemcoll (beg1, len1, beg2, len2);
-      diff = memcmp (beg1, beg2, MIN (len1, len2));
+        diff = xmemcoll (beg1, len1, beg2, len2);
+	  else
+        diff = memcmp (beg1, beg2, MIN (len1, len2));
     }
 
-  if (diff)
-    return diff;
-  return len1 < len2 ? -1 : len1 != len2;
+  if (! diff)
+    diff = len1 < len2 ? -1 : len1 != len2;
+#endif
+return diff;
 }
 
 /* Check that successive input lines PREV and CURRENT from input file
@@ -975,6 +1307,8 @@ main (int argc, char **argv)
   issued_disorder_warning[0] = issued_disorder_warning[1] = false;
   check_input_order = CHECK_ORDER_DEFAULT;
 
+  inittables();
+
   while ((optc = getopt_long (argc, argv, "-a:e:i1:2:j:o:t:v:",
                               longopts, NULL))
          != -1)

bug#6903: join: improve paralleles to sort?

Reply via email to