On 18/05/18 07:49, Daniel Kahn Gillmor wrote:
> Hi GNU coreutils developers!
> 
> This is a feature request for the hashing/digesting tools in coreutils,
> like sha256sum, sha1sum, md5sum, etc.
> 
> currently, those tools emit line-oriented output about the files that
> they hash.  if a file has a newline in the name, those tools emit output
> with a leading backslash (\), like so:
> 
> 0 dkg@alice:~$ sha256sum a*
> 348df4eb47f9230bfe89637afe7409bec883424d822257b6cbbce93ee780d992  a
> 98ea6e4f216f2fb4b69fff9b3a44842c38686ca685f3f55dc48c5d3fb1107be4  a   b
> \98ea6e4f216f2fb4b69fff9b3a44842c38686ca685f3f55dc48c5d3fb1107be4  a\nb\n
> 0 dkg@alice:~$ 
> 
> most users of sha256sum (etc) in shell scripts probably are unaware of
> this behavior and just cross their fingers and hope no file has a
> newline in its filename.
> 
> However, other common tools have processing options that use
> NUL-terminated records. For example, sort (-z) and find (-print0) and
> xargs (-0)
> 
> It would be nice if sha256sum (etc) had a comparable functionality to be
> able to fit better into pipelines with these other tools.
> 
> So i'm proposing a flag -z, --zero-terminated for these tools that
> alters the output (and input) format, to use NUL chars instead of
> newlines.  presumably when that flag is used, the emitted data should
> *not* be backslash-escaped.

I used --zero rather than --zero-terminated, following the other tools
that only change their _output_ format with -z, rather than also
supporting it as an _input_ format. Supporting --check of input with
--zero would complicate the processing while adding no functionality.
Having --check support this format would also introduce an
incompatibility with older versions of these tools.

As an example of processing now possible, the following would
robustly pass duplicate files to xargs for processing:

  $ md5sum src/md5sum.{c,c} -z |
    sort -z | uniq -z -d -w32 | cut -z -c35- |
    xargs -r0

    src/md5sum.c

cheers,
Pádraig
From 53acd6c16a8c3af60dca17eeed50a6cf0c782904 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?P=C3=A1draig=20Brady?= <p...@draigbrady.com>
Date: Sun, 10 Jun 2018 17:45:35 -0700
Subject: [PATCH] md5sum,b2sum,sha*sum: support -z,--zero option

* doc/coreutils.texi (md5sum invocation): Describe the new option,
and how it's not supported by --check, and how it disables escaping.
* src/md5sum.c (delim): A new global to parameterize the output delimiter.
(main): Don't enable file name escaping with -z, and output '\0'.
* tests/misc/md5sum-newline.pl: Add a test case.
* NEWS: Mention the new feature.
---
 NEWS                         |  6 ++++++
 doc/coreutils.texi           | 13 ++++++++-----
 src/md5sum.c                 | 27 +++++++++++++++++++++++----
 tests/misc/md5sum-newline.pl |  2 ++
 4 files changed, 39 insertions(+), 9 deletions(-)

diff --git a/NEWS b/NEWS
index 101afc0..e0f01ce 100644
--- a/NEWS
+++ b/NEWS
@@ -46,6 +46,12 @@ GNU coreutils NEWS                                    -*- outline -*-
   'cp --force file symlink' now removes the symlink even if
   it is self referential.
 
+** New features
+
+  md5sum accepts a new option: --zero (-z) to delimit the output lines with a
+  NUL instead of a newline character.  This also disables file name escaping.
+  This also applies to sha*sum and b2sum.
+
 ** Improvements
 
   cut supports line lengths up to the max file size on 32 bit systems.
diff --git a/doc/coreutils.texi b/doc/coreutils.texi
index c28b8d0..b18c6a5 100644
--- a/doc/coreutils.texi
+++ b/doc/coreutils.texi
@@ -3865,8 +3865,8 @@ a space, a flag indicating binary or text input mode, and the file name.
 Binary mode is indicated with @samp{*}, text mode with @samp{ } (space).
 Binary mode is the default on systems where it's significant,
 otherwise text mode is the default.
-If @var{file} contains a backslash or newline, the
-line is started with a backslash, and each problematic character in
+Without @option{--zero}, if @var{file} contains a backslash or newline,
+the line is started with a backslash, and each problematic character in
 the file name is escaped with a backslash, making the output
 unambiguous even in the presence of arbitrary file names.
 If @var{file} is omitted or specified as @samp{-}, standard input is read.
@@ -3899,6 +3899,7 @@ Three input formats are supported.  Either the default output
 format described above, the @option{--tag} output format,
 or the BSD reversed mode format which is similar to the default mode,
 but doesn't use a character to distinguish binary and text modes.
+Output with @option{--zero} enabled is not supported by @option{--check}.
 @sp 1
 For each such line, @command{md5sum} reads the named file and computes its
 MD5 checksum.  Then, if the computed message digest does not match the
@@ -3947,9 +3948,9 @@ indicating there was a failure.
 @opindex --tag
 @cindex BSD output
 Output BSD style checksums, which indicate the checksum algorithm used.
-As a GNU extension, file names with problematic characters
-are escaped as described above, with the same escaping indicator of @samp{\}
-at the start of the line, being used.
+As a GNU extension, if @option{--zero} is not used, file names with problematic
+characters are escaped as described above, with the same escaping indicator of
+@samp{\} at the start of the line, being used.
 The @option{--tag} option implies binary mode, and is disallowed with
 @option{--text} mode as supporting that would unnecessarily complicate
 the output format, while providing little benefit.
@@ -3982,6 +3983,8 @@ When verifying checksums,
 if one or more input line is invalid,
 exit nonzero after all warnings have been issued.
 
+@optZero
+Also file name escaping is not used.
 @end table
 
 @exitstatus
diff --git a/src/md5sum.c b/src/md5sum.c
index a5c30d9..e41fb24 100644
--- a/src/md5sum.c
+++ b/src/md5sum.c
@@ -157,6 +157,9 @@ static bool strict = false;
 /* Whether a BSD reversed format checksum is detected.  */
 static int bsd_reversed = -1;
 
+/* line delimiter.  */
+static unsigned char delim = '\n';
+
 #if HASH_ALGO_BLAKE2
 static char const *const algorithm_in_string[] =
 {
@@ -210,6 +213,7 @@ static struct option const long_options[] =
   { "warn", no_argument, NULL, 'w' },
   { "strict", no_argument, NULL, STRICT_OPTION },
   { "tag", no_argument, NULL, TAG_OPTION },
+  { "zero", no_argument, NULL, 'z' },
   { GETOPT_HELP_OPTION_DECL },
   { GETOPT_VERSION_OPTION_DECL },
   { NULL, 0, NULL, 0 }
@@ -263,6 +267,10 @@ Print or check %s (%d-bit) checksums.\n\
   -t, --text           read in text mode (default)\n\
 "), stdout);
       fputs (_("\
+  -z, --zero           end each output line with NUL, not newline,\n\
+                       and disable file name escaping\n\
+"), stdout);
+      fputs (_("\
 \n\
 The following five options are useful only when verifying checksums:\n\
       --ignore-missing  don't fail or report status for missing files\n\
@@ -875,10 +883,10 @@ main (int argc, char **argv)
   setvbuf (stdout, NULL, _IOLBF, 0);
 
 #if HASH_ALGO_BLAKE2
-  const char* short_opts = "l:bctw";
+  const char* short_opts = "l:bctwz";
   const char* b2_length_str = "";
 #else
-  const char* short_opts = "bctw";
+  const char* short_opts = "bctwz";
 #endif
 
   while ((opt = getopt_long (argc, argv, short_opts, long_options, NULL)) != -1)
@@ -930,6 +938,9 @@ main (int argc, char **argv)
         prefix_tag = true;
         binary = 1;
         break;
+      case 'z':
+        delim = '\0';
+        break;
       case_GETOPT_HELP_CHAR;
       case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
       default:
@@ -964,6 +975,13 @@ main (int argc, char **argv)
      usage (EXIT_FAILURE);
    }
 
+  if (delim != '\n' && do_check)
+    {
+      error (0, 0, _("the --zero option is not supported when "
+                     "verifying checksums"));
+      usage (EXIT_FAILURE);
+    }
+
   if (prefix_tag && do_check)
     {
       error (0, 0, _("the --tag option is meaningless when "
@@ -1043,7 +1061,8 @@ main (int argc, char **argv)
                  against old (hashed) outputs, in the presence of files
                  containing '\\' characters, we decided to not simplify the
                  output in this case.  */
-              bool needs_escape = strchr (file, '\\') || strchr (file, '\n');
+              bool needs_escape = (strchr (file, '\\') || strchr (file, '\n'))
+                                  && delim == '\n';
 
               if (prefix_tag)
                 {
@@ -1079,7 +1098,7 @@ main (int argc, char **argv)
                   print_filename (file, needs_escape);
                 }
 
-              putchar ('\n');
+              putchar (delim);
             }
         }
     }
diff --git a/tests/misc/md5sum-newline.pl b/tests/misc/md5sum-newline.pl
index b76e2b8..a7ab2bd 100755
--- a/tests/misc/md5sum-newline.pl
+++ b/tests/misc/md5sum-newline.pl
@@ -30,10 +30,12 @@ system ('touch', "a\nb") == 0
 
 my $degenerate = "d41d8cd98f00b204e9800998ecf8427e";
 my $t = '--text';
+my $z = '--zero';
 
 my @Tests =
     (
      ['newline', $t, {IN=> {"a\nb"=> ''}}, {OUT=>"\\$degenerate  a\\nb\n"}],
+     ['zero', $z, {IN=> {"a\nb"=> ''}}, {OUT=>"$degenerate  a\nb\0"}],
     );
 
 my $save_temps = $ENV{DEBUG};
-- 
2.9.3

Reply via email to