Hello,

This patch adds "-z" to join, to support joining zero-terminated lines.
The patch is heavily based on James Youngman's patch that added -z to uniq
(commit e062524).

-gordon

P.S.
This patch is independent of the key-comparison patches discussed recently, 
though I'm also adding it there.
>From 525eb72b150ed34d3bfcfe453d1494fe28a824b7 Mon Sep 17 00:00:00 2001
From: Assaf Gordon <assafgor...@gmail.com>
Date: Thu, 14 Feb 2013 15:29:08 -0500
Subject: [PATCH] join: Add -z option

* NEWS: Mention join's new option: --zero-terminated (-z).
* src/join.c: Add new option, --zero-terminated (-z), to make
join use the NUL byte as separator/delimiter rather than newline.
(get_line): Use readlinebuffer_delim in place of readlinebuffer.
(prjoin): Terminate output lines with eolchar instead of newline.
(main): Handle the new option.
(usage): Describe new option the same way sort does.
* doc/coreutils.texi (join invocation): Describe the new option.
* tests/misc/join.pl: add tests for -z option.
---
 NEWS               |    6 ++++++
 doc/coreutils.texi |   17 +++++++++++++++++
 src/join.c         |   19 +++++++++++++++----
 tests/misc/join.pl |   20 ++++++++++++++++++++
 4 files changed, 58 insertions(+), 4 deletions(-)

diff --git a/NEWS b/NEWS
index 37bcdf7..618c1da 100644
--- a/NEWS
+++ b/NEWS
@@ -2,6 +2,12 @@ GNU coreutils NEWS                                    -*- outline -*-
 
 * Noteworthy changes in release ?.? (????-??-??) [?]
 
+** New features
+
+  join accepts a new option: --zero-terminated (-z).  As with the sort and
+  uniq options of the same name, this makes join consume and produce
+  NUL-terminated lines rather than newline-terminated lines.
+
 
 * Noteworthy changes in release 8.21 (2013-02-14) [stable]
 
diff --git a/doc/coreutils.texi b/doc/coreutils.texi
index 2c16dc4..a72d9ce 100644
--- a/doc/coreutils.texi
+++ b/doc/coreutils.texi
@@ -6059,6 +6059,10 @@ available; the sort order can be any order that considers two fields
 to be equal if and only if the sort comparison described above
 considers them to be equal.  For example:
 
+Input and output lines are terminated with a newline character unless the
+@option{--zero-terminated} (@option{-z}) option is used, in which case lines
+are @sc{nul} terminated.
+
 @example
 $ cat file1
 a a1
@@ -6181,6 +6185,19 @@ character is used to delimit the fields.
 Print a line for each unpairable line in file @var{file-number}
 (either @samp{1} or @samp{2}), instead of the normal output.
 
+@item -z
+@itemx --zero-terminated
+@opindex -z
+@opindex --zero-terminated
+@cindex join zero-terminated lines
+Treat the input as a set of lines, each terminated by a null character
+(ASCII @sc{nul}) instead of a line feed
+(ASCII @sc{lf}).
+This option can be useful in conjunction with @samp{sort -z}, @samp{uniq -z},
+@samp{perl -0} or @samp{find -print0} and @samp{xargs -0} which do the same in
+order to reliably handle arbitrary file names (even those containing blanks
+or other special characters).
+
 @end table
 
 @exitstatus
diff --git a/src/join.c b/src/join.c
index 11e647c..1810ac2 100644
--- a/src/join.c
+++ b/src/join.c
@@ -161,6 +161,7 @@ static struct option const longopts[] =
   {"ignore-case", no_argument, NULL, 'i'},
   {"check-order", no_argument, NULL, CHECK_ORDER_OPTION},
   {"nocheck-order", no_argument, NULL, NOCHECK_ORDER_OPTION},
+  {"zero-terminated", no_argument, NULL, 'z'},
   {"header", no_argument, NULL, HEADER_LINE_OPTION},
   {GETOPT_HELP_OPTION_DECL},
   {GETOPT_VERSION_OPTION_DECL},
@@ -177,6 +178,9 @@ static bool ignore_case;
    join them without checking for ordering */
 static bool join_header_lines;
 
+/* The character marking end of line. Default to \n. */
+static char eolchar = '\n';
+
 void
 usage (int status)
 {
@@ -213,6 +217,9 @@ by whitespace.  When FILE1 or FILE2 (not both) is -, read standard input.\n\
   --header          treat the first line in each file as field headers,\n\
                       print them without trying to pair them\n\
 "), stdout);
+      fputs (_("\
+  -z, --zero-terminated     end lines with 0 byte, not newline\n\
+"), stdout);
       fputs (HELP_OPTION_DESCRIPTION, stdout);
       fputs (VERSION_OPTION_DESCRIPTION, stdout);
       fputs (_("\
@@ -445,7 +452,7 @@ get_line (FILE *fp, struct line **linep, int which)
   else
     line = init_linep (linep);
 
-  if (! readlinebuffer (&line->buf, fp))
+  if (! readlinebuffer_delim (&line->buf, fp, eolchar))
     {
       if (ferror (fp))
         error (EXIT_FAILURE, errno, _("read error"));
@@ -614,7 +621,7 @@ prjoin (struct line const *line1, struct line const *line2)
             break;
           putchar (output_separator);
         }
-      putchar ('\n');
+      putchar (eolchar);
     }
   else
     {
@@ -636,7 +643,7 @@ prjoin (struct line const *line1, struct line const *line2)
       prfields (line1, join_field_1, autocount_1);
       prfields (line2, join_field_2, autocount_2);
 
-      putchar ('\n');
+      putchar (eolchar);
     }
 }
 
@@ -1017,7 +1024,7 @@ main (int argc, char **argv)
   issued_disorder_warning[0] = issued_disorder_warning[1] = false;
   check_input_order = CHECK_ORDER_DEFAULT;
 
-  while ((optc = getopt_long (argc, argv, "-a:e:i1:2:j:o:t:v:",
+  while ((optc = getopt_long (argc, argv, "-a:e:i1:2:j:o:t:v:z",
                               longopts, NULL))
          != -1)
     {
@@ -1107,6 +1114,10 @@ main (int argc, char **argv)
           }
           break;
 
+        case 'z':
+          eolchar = 0;
+          break;
+
         case NOCHECK_ORDER_OPTION:
           check_input_order = CHECK_ORDER_DISABLED;
           break;
diff --git a/tests/misc/join.pl b/tests/misc/join.pl
index 9b93794..c467054 100755
--- a/tests/misc/join.pl
+++ b/tests/misc/join.pl
@@ -275,6 +275,26 @@ my @tv = (
  [ "ID1 Name\n1 A\n", ""],
    "ID1 Name\n1 A\n", 0],
 
+# Zero-terminated lines
+['z1', '-z',
+ ["a\0c\0e\0", "a\0b\0c\0"], "a\0c\0", 0],
+
+# not zero-terminated, but related to the code change:
+#  the old readlinebuffer() auto-added '\n' to the last line.
+#  the new readlinebuffer_delim() does not.
+#  Ensure it doesn't matter.
+['z2', '',
+ ["a\nc\ne\n", "a\nb\nc"], "a\nc\n", 0],
+['z3', '',
+ ["a\nc\ne", "a\nb\nc"], "a\nc\n", 0],
+# missing last NUL at the end of the last line (=end of file)
+['z4', '-z',
+ ["a\0c\0e", "a\0b\0c"], "a\0c\0", 0],
+# edge-case: the embedded newlines should be treated as
+# part of the nul-terminated line
+['z5', '-z -a1 -a2',
+ ["a\n1\0c 3\0","b\n8\0c 9\0"], "a\n1\0b\n8\0c 3 9\0"],
+
 );
 
 # Convert the above old-style test vectors to the newer
-- 
1.7.7.4

Reply via email to