On 28/09/15 15:17, Stephane Chazelas wrote: > 2015-09-26 15:43:40 +0100, Richard Russon: >> I'd like to add an option to both head and tail, >> to allow them to work with NUL-terminated lines of text >> -z, --zero-terminated >> >> Thus allowing: >> >> find dir -type f -print0 | head -z -n 10 | xargs -0 command > [...] > > See also > > sed -z 10q > > as an alternative to > > head -zn 10 > > While we're at it, why not add it to every text utility (cut, > paste, seq, yes, tac...) for those that don't have it already?
Yes, we've been adding -z support piecemeal over time, so I propose we add this as per the attached patch set to: wc, comm, cut, head, tail, tac, paste When looking at coreutils which might benefit from -z I split them into four categories: 1. One output item per input argument. NUL-terminated input is catered for by xargs. NUL-terminated output is handled with -z,--zero already for these utils: basename dirname du readlink realpath stat (handled with --printf='...\0') md5sum (\n is escaped) sha*sum (ditto) ls (ditto, also \0 supported by find) Possible additions to this class: cksum (obsolescent) sum (ditto) wc 2. Multiple output records per input file/stdin. NUL-terminated I/O is handled with -z,--zero-terminated already for these utils: join shuf sort uniq Possible additions to this class: comm cut head (especially since it supports multiple files and seeking within them) tail (ditto) tac (ditto; extend -s to support '') paste nl (N/A as primarily text rather than record oriented) numfmt (ditto) expand (ditto) unexpand (ditto) fmt (N/A as word oriented rather than record oriented) fold (ditto) tsort (ditto) 3. Misc record processing id (handled already with -z) split (handled already with -t,--separator='\0') csplit (pattern-based so only supports text) seq (might support -s '\0' but can't see the need) yes (easy to handle this edge case with tr)
From 9da3cb61020fd58fde8dba6e8caf7ad101816797 Mon Sep 17 00:00:00 2001 From: Richard Russon <[email protected]> Date: Sat, 26 Sep 2015 14:22:26 +0100 Subject: [PATCH 1/7] head,tail: add the -z,--zero-terminated option * doc/coreutils.texi: Reference the option description. * src/head.c: Parameterize the delimiter character. * src/tail.c: Likewise. * tests/misc/head.pl: Add test case. * tests/misc/tail.pl: Likewise. * NEWS: Mention the new feature. --- NEWS | 3 +++ doc/coreutils.texi | 4 ++++ src/head.c | 31 ++++++++++++++++++++++++------- src/tail.c | 26 +++++++++++++++++++------- tests/misc/head.pl | 4 ++++ tests/misc/tail.pl | 4 ++++ 6 files changed, 58 insertions(+), 14 deletions(-) diff --git a/NEWS b/NEWS index 6e48a53..c88b4e3 100644 --- a/NEWS +++ b/NEWS @@ -39,6 +39,9 @@ GNU coreutils NEWS -*- outline -*- Its status=progress output now uses the same format as ordinary status, perhaps with trailing spaces to erase previous progress output. + head, tail now have -z, --zero-terminated options to work with + NUL delimited items. + md5sum now supports the --ignore-missing option to allow verifying a subset of files given a larger list of checksums. This also affects sha1sum, sha224sum, sha256sum, sha384sum and sha512sum. diff --git a/doc/coreutils.texi b/doc/coreutils.texi index 2538062..2635fbe 100644 --- a/doc/coreutils.texi +++ b/doc/coreutils.texi @@ -2817,6 +2817,8 @@ Never print file name headers. @opindex --verbose Always print file name headers. +@optZeroTerminated + @end table For compatibility @command{head} also supports an obsolete option syntax @@ -3042,6 +3044,8 @@ every @var{number} seconds. @opindex --verbose Always print file name headers. +@optZeroTerminated + @end table For compatibility @command{tail} also supports an obsolete usage diff --git a/src/head.c b/src/head.c index a5405aa..282c2ea 100644 --- a/src/head.c +++ b/src/head.c @@ -58,6 +58,9 @@ static bool presume_input_pipe; /* If true, print filename headers. 
*/ static bool print_headers; +/* Character to split lines by. */ +static char line_end; + /* When to print the filename banners. */ enum header_mode { @@ -90,6 +93,7 @@ static struct option const long_options[] = {"quiet", no_argument, NULL, 'q'}, {"silent", no_argument, NULL, 'q'}, {"verbose", no_argument, NULL, 'v'}, + {"zero-terminated", no_argument, NULL, 'z'}, {GETOPT_HELP_OPTION_DECL}, {GETOPT_VERSION_OPTION_DECL}, {NULL, 0, NULL, 0} @@ -126,6 +130,9 @@ With more than one FILE, precede each with a header giving the file name.\n\ -q, --quiet, --silent never print headers giving file names\n\ -v, --verbose always print headers giving file names\n\ "), stdout); + fputs (_("\ + -z, --zero-terminated line delimiter is NUL, not newline\n\ +"), stdout); fputs (HELP_OPTION_DESCRIPTION, stdout); fputs (VERSION_OPTION_DESCRIPTION, stdout); fputs (_("\ @@ -532,7 +539,7 @@ elide_tail_lines_pipe (const char *filename, int fd, uintmax_t n_elide, { char const *buffer_end = tmp->buffer + n_read; char const *p = tmp->buffer; - while ((p = memchr (p, '\n', buffer_end - p))) + while ((p = memchr (p, line_end, buffer_end - p))) { ++p; ++tmp->nlines; @@ -581,7 +588,7 @@ elide_tail_lines_pipe (const char *filename, int fd, uintmax_t n_elide, /* If we read any bytes at all, count the incomplete line on files that don't end with a newline. 
*/ - if (last->nbytes && last->buffer[last->nbytes - 1] != '\n') + if (last->nbytes && last->buffer[last->nbytes - 1] != line_end) { ++last->nlines; ++total_lines; @@ -600,7 +607,7 @@ elide_tail_lines_pipe (const char *filename, int fd, uintmax_t n_elide, size_t n = total_lines - n_elide; char const *buffer_end = tmp->buffer + tmp->nbytes; char const *p = tmp->buffer; - while (n && (p = memchr (p, '\n', buffer_end - p))) + while (n && (p = memchr (p, line_end, buffer_end - p))) { ++p; ++tmp->nlines; @@ -664,7 +671,7 @@ elide_tail_lines_seekable (const char *pretty_filename, int fd, const bool all_lines = !n_lines; /* Count the incomplete line on files that don't end with a newline. */ - if (n_lines && bytes_read && buffer[bytes_read - 1] != '\n') + if (n_lines && bytes_read && buffer[bytes_read - 1] != line_end) --n_lines; while (1) @@ -679,7 +686,7 @@ elide_tail_lines_seekable (const char *pretty_filename, int fd, else { char const *nl; - nl = memrchr (buffer, '\n', n); + nl = memrchr (buffer, line_end, n); if (nl == NULL) break; n = nl - buffer; @@ -804,7 +811,7 @@ head_lines (const char *filename, int fd, uintmax_t lines_to_write) if (bytes_read == 0) break; while (bytes_to_write < bytes_read) - if (buffer[bytes_to_write++] == '\n' && --lines_to_write == 0) + if (buffer[bytes_to_write++] == line_end && --lines_to_write == 0) { off_t n_bytes_past_EOL = bytes_read - bytes_to_write; /* If we have read more data than that on the specified number @@ -942,6 +949,8 @@ main (int argc, char **argv) print_headers = false; + line_end = '\n'; + if (1 < argc && argv[1][0] == '-' && ISDIGIT (argv[1][1])) { char *a = argv[1]; @@ -986,6 +995,10 @@ main (int argc, char **argv) header_mode = always; break; + case 'z': + line_end = '\0'; + break; + default: error (0, 0, _("invalid trailing option -- %c"), *a); usage (EXIT_FAILURE); @@ -1006,7 +1019,7 @@ main (int argc, char **argv) argc--; } - while ((c = getopt_long (argc, argv, "c:n:qv0123456789", long_options, NULL)) + while 
((c = getopt_long (argc, argv, "c:n:qvz0123456789", long_options, NULL)) != -1) { switch (c) @@ -1039,6 +1052,10 @@ main (int argc, char **argv) header_mode = always; break; + case 'z': + line_end = '\0'; + break; + case_GETOPT_HELP_CHAR; case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS); diff --git a/src/tail.c b/src/tail.c index 9007888..781adf2 100644 --- a/src/tail.c +++ b/src/tail.c @@ -180,6 +180,9 @@ static bool from_start; /* If true, print filename headers. */ static bool print_headers; +/* Character to split lines by. */ +static char line_end; + /* When to print the filename banners. */ enum header_mode { @@ -238,6 +241,7 @@ static struct option const long_options[] = {"silent", no_argument, NULL, 'q'}, {"sleep-interval", required_argument, NULL, 's'}, {"verbose", no_argument, NULL, 'v'}, + {"zero-terminated", no_argument, NULL, 'z'}, {GETOPT_HELP_OPTION_DECL}, {GETOPT_VERSION_OPTION_DECL}, {NULL, 0, NULL, 0} @@ -297,6 +301,9 @@ With more than one FILE, precede each with a header giving the file name.\n\ least once every N seconds\n\ -v, --verbose always output headers giving file names\n\ "), stdout); + fputs (_("\ + -z, --zero-terminated line delimiter is NUL, not newline\n\ +"), stdout); fputs (HELP_OPTION_DESCRIPTION, stdout); fputs (VERSION_OPTION_DESCRIPTION, stdout); fputs (_("\ @@ -499,7 +506,7 @@ file_lines (const char *pretty_filename, int fd, uintmax_t n_lines, *read_pos = pos + bytes_read; /* Count the incomplete line on files that don't end with a newline. 
*/ - if (bytes_read && buffer[bytes_read - 1] != '\n') + if (bytes_read && buffer[bytes_read - 1] != line_end) --n_lines; do @@ -510,7 +517,7 @@ file_lines (const char *pretty_filename, int fd, uintmax_t n_lines, while (n) { char const *nl; - nl = memrchr (buffer, '\n', n); + nl = memrchr (buffer, line_end, n); if (nl == NULL) break; n = nl - buffer; @@ -595,7 +602,7 @@ pipe_lines (const char *pretty_filename, int fd, uintmax_t n_lines, { char const *buffer_end = tmp->buffer + n_read; char const *p = tmp->buffer; - while ((p = memchr (p, '\n', buffer_end - p))) + while ((p = memchr (p, line_end, buffer_end - p))) { ++p; ++tmp->nlines; @@ -649,7 +656,7 @@ pipe_lines (const char *pretty_filename, int fd, uintmax_t n_lines, goto free_lbuffers; /* Count the incomplete line on files that don't end with a newline. */ - if (last->buffer[last->nbytes - 1] != '\n') + if (last->buffer[last->nbytes - 1] != line_end) { ++last->nlines; ++total_lines; @@ -671,7 +678,7 @@ pipe_lines (const char *pretty_filename, int fd, uintmax_t n_lines, size_t j; for (j = total_lines - n_lines; j; --j) { - beg = memchr (beg, '\n', buffer_end - beg); + beg = memchr (beg, line_end, buffer_end - beg); assert (beg); ++beg; } @@ -857,7 +864,7 @@ start_lines (const char *pretty_filename, int fd, uintmax_t n_lines, *read_pos += bytes_read; char *p = buffer; - while ((p = memchr (p, '\n', buffer_end - p))) + while ((p = memchr (p, line_end, buffer_end - p))) { ++p; if (--n_lines == 0) @@ -2047,7 +2054,7 @@ parse_options (int argc, char **argv, { int c; - while ((c = getopt_long (argc, argv, "c:n:fFqs:v0123456789", + while ((c = getopt_long (argc, argv, "c:n:fFqs:vz0123456789", long_options, NULL)) != -1) { @@ -2124,6 +2131,10 @@ parse_options (int argc, char **argv, *header_mode = always; break; + case 'z': + line_end = '\0'; + break; + case_GETOPT_HELP_CHAR; case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS); @@ -2221,6 +2232,7 @@ main (int argc, char **argv) count_lines = true; forever = from_start = 
print_headers = false; + line_end = '\n'; obsolete_option = parse_obsolete_option (argc, argv, &n_units); argc -= obsolete_option; argv += obsolete_option; diff --git a/tests/misc/head.pl b/tests/misc/head.pl index 78644f2..1f565cf 100755 --- a/tests/misc/head.pl +++ b/tests/misc/head.pl @@ -72,6 +72,10 @@ my @Tests = ['no-oct-2', '-010', {IN=>"\n"x12}, {OUT=>"\n"x10}], ['no-oct-3', '-n 08', {IN=>"\n"x12}, {OUT=>"\n"x8}], ['no-oct-4', '-c 08', {IN=>"\n"x12}, {OUT=>"\n"x8}], + + # --zero-terminated + ['zero-1', '-z -n 1', {IN=>"x\0y"}, {OUT=>"x\0"}], + ['zero-2', '-z -n 2', {IN=>"x\0y"}, {OUT=>"x\0y"}], ); @Tests = triple_test \@Tests; diff --git a/tests/misc/tail.pl b/tests/misc/tail.pl index c23102f..0d9bc48 100755 --- a/tests/misc/tail.pl +++ b/tests/misc/tail.pl @@ -101,6 +101,10 @@ my @tv = ( # With textutils-1.22, this failed. ['f-pipe-1', '-f -n 1', "a\nb\n", "b\n", 0], + +# --zero-terminated +['zero-1', '-z -n 1', "x\0y", "y", 0], +['zero-2', '-z -n 2', "x\0y", "x\0y", 0], ); my @Tests; -- 2.5.0 From 92d159dd43796d36b210a4ae8d493919cd4abda9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A1draig=20Brady?= <[email protected]> Date: Fri, 8 Jan 2016 13:04:03 +0000 Subject: [PATCH 2/7] cut: add the -z,--zero-terminated option * doc/coreutils.texi (cut invocation): Reference the description. * src/cut.c: Parameterize '\n' references. * tests/misc/cut.pl: Add tests for character and field processing. * NEWS: Mention the new feature. --- NEWS | 6 +++--- doc/coreutils.texi | 2 ++ src/cut.c | 42 +++++++++++++++++++++++++++--------------- tests/misc/cut.pl | 8 ++++++++ 4 files changed, 40 insertions(+), 18 deletions(-) diff --git a/NEWS b/NEWS index c88b4e3..22df138 100644 --- a/NEWS +++ b/NEWS @@ -33,15 +33,15 @@ GNU coreutils NEWS -*- outline -*- ** New features + cut, head, tail now have -z, --zero-terminated options to work with + NUL delimited items. + dd now summarizes sizes in --human-readable format too, not just --si. 
E.g., "3441325000 bytes (3.4 GB, 3.2 GiB) copied". It omits the summaries if they would not provide useful information, e.g., "3 bytes copied". Its status=progress output now uses the same format as ordinary status, perhaps with trailing spaces to erase previous progress output. - head, tail now have -z, --zero-terminated options to work with - NUL delimited items. - md5sum now supports the --ignore-missing option to allow verifying a subset of files given a larger list of checksums. This also affects sha1sum, sha224sum, sha256sum, sha384sum and sha512sum. diff --git a/doc/coreutils.texi b/doc/coreutils.texi index 2635fbe..fd4322e 100644 --- a/doc/coreutils.texi +++ b/doc/coreutils.texi @@ -5898,6 +5898,8 @@ In other words, do @emph{not} print the bytes, characters or fields specified via those options. This option is useful when you have many fields and want to print all but a few of them. +@optZeroTerminated + @end table @exitstatus diff --git a/src/cut.c b/src/cut.c index 96440af..7ab6be4 100644 --- a/src/cut.c +++ b/src/cut.c @@ -98,6 +98,9 @@ static bool complement; /* The delimiter character for field mode. */ static unsigned char delim; +/* The delimiter for each line/record. */ +static unsigned char line_delim = '\n'; + /* True if the --output-delimiter=STRING option was specified. 
*/ static bool output_delimiter_specified; @@ -128,6 +131,7 @@ static struct option const longopts[] = {"only-delimited", no_argument, NULL, 's'}, {"output-delimiter", required_argument, NULL, OUTPUT_DELIMITER_OPTION}, {"complement", no_argument, NULL, COMPLEMENT_OPTION}, + {"zero-terminated", no_argument, NULL, 'z'}, {GETOPT_HELP_OPTION_DECL}, {GETOPT_VERSION_OPTION_DECL}, {NULL, 0, NULL, 0} @@ -171,6 +175,9 @@ Print selected parts of lines from each FILE to standard output.\n\ --output-delimiter=STRING use STRING as the output delimiter\n\ the default is to use the input delimiter\n\ "), stdout); + fputs (_("\ + -z, --zero-terminated line delimiter is NUL, not newline\n\ +"), stdout); fputs (HELP_OPTION_DESCRIPTION, stdout); fputs (VERSION_OPTION_DESCRIPTION, stdout); fputs (_("\ @@ -239,9 +246,9 @@ cut_bytes (FILE *stream) c = getc (stream); - if (c == '\n') + if (c == line_delim) { - putchar ('\n'); + putchar (c); byte_idx = 0; print_delimiter = false; current_rp = frp; @@ -249,7 +256,7 @@ cut_bytes (FILE *stream) else if (c == EOF) { if (byte_idx > 0) - putchar ('\n'); + putchar (line_delim); break; } else @@ -308,7 +315,7 @@ cut_fields (FILE *stream) size_t n_bytes; len = getndelim2 (&field_1_buffer, &field_1_bufsize, 0, - GETNLINE_NO_LIMIT, delim, '\n', stream); + GETNLINE_NO_LIMIT, delim, line_delim, stream); if (len < 0) { free (field_1_buffer); @@ -336,9 +343,9 @@ cut_fields (FILE *stream) { fwrite (field_1_buffer, sizeof (char), n_bytes, stdout); /* Make sure the output line is newline terminated. */ - if (field_1_buffer[n_bytes - 1] != '\n') - putchar ('\n'); - c = '\n'; + if (field_1_buffer[n_bytes - 1] != line_delim) + putchar (line_delim); + c = line_delim; } continue; } @@ -348,7 +355,7 @@ cut_fields (FILE *stream) fwrite (field_1_buffer, sizeof (char), n_bytes - 1, stdout); /* With -d$'\n' don't treat the last '\n' as a delimiter. 
*/ - if (delim == '\n') + if (delim == line_delim) { int last_c = getc (stream); if (last_c != EOF) @@ -374,7 +381,7 @@ cut_fields (FILE *stream) } found_any_selected_field = true; - while ((c = getc (stream)) != delim && c != '\n' && c != EOF) + while ((c = getc (stream)) != delim && c != line_delim && c != EOF) { putchar (c); prev_c = c; @@ -382,14 +389,14 @@ cut_fields (FILE *stream) } else { - while ((c = getc (stream)) != delim && c != '\n' && c != EOF) + while ((c = getc (stream)) != delim && c != line_delim && c != EOF) { prev_c = c; } } /* With -d$'\n' don't treat the last '\n' as a delimiter. */ - if (delim == '\n' && c == delim) + if (delim == line_delim && c == delim) { int last_c = getc (stream); if (last_c != EOF) @@ -400,13 +407,14 @@ cut_fields (FILE *stream) if (c == delim) next_item (&field_idx); - else if (c == '\n' || c == EOF) + else if (c == line_delim || c == EOF) { if (found_any_selected_field || !(suppress_non_delimited && field_idx == 1)) { - if (c == '\n' || prev_c != '\n' || delim == '\n') - putchar ('\n'); + if (c == line_delim || prev_c != line_delim + || delim == line_delim) + putchar (line_delim); } if (c == EOF) break; @@ -492,7 +500,7 @@ main (int argc, char **argv) delim = '\0'; have_read_stdin = false; - while ((optc = getopt_long (argc, argv, "b:c:d:f:ns", longopts, NULL)) != -1) + while ((optc = getopt_long (argc, argv, "b:c:d:f:nsz", longopts, NULL)) != -1) { switch (optc) { @@ -538,6 +546,10 @@ main (int argc, char **argv) suppress_non_delimited = true; break; + case 'z': + line_delim = '\0'; + break; + case COMPLEMENT_OPTION: complement = true; break; diff --git a/tests/misc/cut.pl b/tests/misc/cut.pl index 70c5a64..f6f8a56 100755 --- a/tests/misc/cut.pl +++ b/tests/misc/cut.pl @@ -161,6 +161,14 @@ my @Tests = ['newline-23', "-d'\n'", '-f1-', '--ou=:', {IN=>"a\nb\n"}, {OUT=>"a:b\n"}], ['newline-24', "-d'\n'", '-f1,2', '--ou=:', {IN=>"a\nb\n"}, {OUT=>"a:b\n"}], + # --zero-terminated + ['zerot-1', "-z", '-c1', {IN=>"ab\0cd\0"}, 
{OUT=>"a\0c\0"}], + ['zerot-2', "-z", '-c1', {IN=>"ab\0cd"}, {OUT=>"a\0c\0"}], + ['zerot-3', '-z -f1-', {IN=>""}, {OUT=>""}], + ['zerot-4', '-z -d:', '-f1', {IN=>"a:1\0b:2"}, {OUT=>"a\0b\0"}], + ['zerot-5', '-z -d:', '-f1-', {IN=>"a1:\0:"}, {OUT=>"a1:\0:\0"}], + ['zerot-6', "-z -d ''", '-f1,2', '--ou=:', {IN=>"a\0b\0"}, {OUT=>"a:b\0"}], + # New functionality: ['out-delim1', '-c1-3,5-', '--output-d=:', {IN=>"abcdefg\n"}, {OUT=>"abc:efg\n"}], -- 2.5.0 From 675d9e9113377d7e7c5918f0f0b1353c1f368fba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A1draig=20Brady?= <[email protected]> Date: Fri, 8 Jan 2016 14:31:27 +0000 Subject: [PATCH 3/7] tac: support an empty (NUL) --separator * doc/coreutils.texi (tac invocation): Mention the NUL delineation with an empty --separator. * src/tac.c (main): Allow an empty separator when -r not specified. * tests/misc/tac.pl: Add test cases. * NEWS: Mention the new feature. Fixes http://bugs.gnu.org/8103 --- NEWS | 4 ++-- doc/coreutils.texi | 2 ++ src/tac.c | 7 ++++--- tests/misc/tac.pl | 7 +++++++ 4 files changed, 15 insertions(+), 5 deletions(-) diff --git a/NEWS b/NEWS index 22df138..a3b5636 100644 --- a/NEWS +++ b/NEWS @@ -33,8 +33,8 @@ GNU coreutils NEWS -*- outline -*- ** New features - cut, head, tail now have -z, --zero-terminated options to work with - NUL delimited items. + cut, head, tail now have the -z,--zero-terminated option, and + tac --separator accepts an empty argument, to work with NUL delimited items. dd now summarizes sizes in --human-readable format too, not just --si. E.g., "3441325000 bytes (3.4 GB, 3.2 GiB) copied". It omits the summaries diff --git a/doc/coreutils.texi b/doc/coreutils.texi index fd4322e..ba68416 100644 --- a/doc/coreutils.texi +++ b/doc/coreutils.texi @@ -1706,6 +1706,8 @@ Treat the separator string as a regular expression. @opindex -s @opindex --separator Use @var{separator} as the record separator, instead of newline. +Note an empty @var{separator} is treated as a zero byte. 
+I.e., input and output items are delimited with ASCII NUL. @end table diff --git a/src/tac.c b/src/tac.c index 2410224..4681f3a 100644 --- a/src/tac.c +++ b/src/tac.c @@ -639,8 +639,6 @@ main (int argc, char **argv) break; case 's': separator = optarg; - if (*separator == 0) - error (EXIT_FAILURE, 0, _("separator cannot be empty")); break; case_GETOPT_HELP_CHAR; case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS); @@ -651,6 +649,9 @@ main (int argc, char **argv) if (sentinel_length == 0) { + if (*separator == 0) + error (EXIT_FAILURE, 0, _("separator cannot be empty")); + compiled_separator.buffer = NULL; compiled_separator.allocated = 0; compiled_separator.fastmap = compiled_separator_fastmap; @@ -661,7 +662,7 @@ main (int argc, char **argv) error (EXIT_FAILURE, 0, "%s", (error_message)); } else - match_length = sentinel_length = strlen (separator); + match_length = sentinel_length = *separator ? strlen (separator) : 1; read_size = INITIAL_READSIZE; while (sentinel_length >= read_size / 2) diff --git a/tests/misc/tac.pl b/tests/misc/tac.pl index 6297b16..fb76719 100755 --- a/tests/misc/tac.pl +++ b/tests/misc/tac.pl @@ -45,6 +45,13 @@ my @Tests = ['basic-j', '', {IN=>"1234\n8\n"}, {OUT=>"8\n1234\n"}], ['basic-k', '', {IN=>"123\n8\n"}, {OUT=>"8\n123\n"}], + ['nul-0', '-s ""', {IN=>""}, {OUT=>""}], + ['nul-a', '-s ""', {IN=>"a"}, {OUT=>"a"}], + ['nul-b', '-s ""', {IN=>"\0"}, {OUT=>"\0"}], + ['nul-c', '-s ""', {IN=>"a\0"}, {OUT=>"a\0"}], + ['nul-d', '-s ""', {IN=>"a\0b"}, {OUT=>"ba\0"}], + ['nul-e', '-s ""', {IN=>"a\0b\0"}, {OUT=>"b\0a\0"}], + ['opt-b', '-b', {IN=>"\na\nb\nc"}, {OUT=>"\nc\nb\na"}], ['opt-s', '-s:', {IN=>"a:b:c:"}, {OUT=>"c:b:a:"}], ['opt-sb', qw(-s : -b), {IN=>":a:b:c"}, {OUT=>":c:b:a"}], -- 2.5.0 From 5380b507b3d92f689ff99a23972180ffbfb7f7d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A1draig=20Brady?= <[email protected]> Date: Fri, 8 Jan 2016 15:14:01 +0000 Subject: [PATCH 4/7] comm: add the -z,--zero-terminated option * doc/coreutils.texi (comm 
invocation): Reference option description. * src/comm.c (main): Use readlinebuffer_delim() to support a parameterized delimiter. * tests/misc/comm.pl: Add test cases. * NEWS: Mention the new feature. --- NEWS | 2 +- doc/coreutils.texi | 2 ++ src/comm.c | 19 ++++++++++++++++--- tests/misc/comm.pl | 3 +++ 4 files changed, 22 insertions(+), 4 deletions(-) diff --git a/NEWS b/NEWS index a3b5636..30a1526 100644 --- a/NEWS +++ b/NEWS @@ -33,7 +33,7 @@ GNU coreutils NEWS -*- outline -*- ** New features - cut, head, tail now have the -z,--zero-terminated option, and + comm, cut, head, tail now have the -z,--zero-terminated option, and tac --separator accepts an empty argument, to work with NUL delimited items. dd now summarizes sizes in --human-readable format too, not just --si. diff --git a/doc/coreutils.texi b/doc/coreutils.texi index ba68416..99f0f2d 100644 --- a/doc/coreutils.texi +++ b/doc/coreutils.texi @@ -5126,6 +5126,8 @@ rather than the default of a single TAB character. The delimiter @var{str} may not be empty. +@optZeroTerminated + @end table @node ptx invocation diff --git a/src/comm.c b/src/comm.c index 89cee88..e66ac81 100644 --- a/src/comm.c +++ b/src/comm.c @@ -59,6 +59,9 @@ static bool seen_unpairable; /* If nonzero, we have warned about disorder in that file. */ static bool issued_disorder_warning[2]; +/* line delimiter. */ +static unsigned char delim = '\n'; + /* If nonzero, check that the input is correctly ordered. 
*/ static enum { @@ -86,6 +89,7 @@ static struct option const long_options[] = {"check-order", no_argument, NULL, CHECK_ORDER_OPTION}, {"nocheck-order", no_argument, NULL, NOCHECK_ORDER_OPTION}, {"output-delimiter", required_argument, NULL, OUTPUT_DELIMITER_OPTION}, + {"zero-terminated", no_argument, NULL, 'z'}, {GETOPT_HELP_OPTION_DECL}, {GETOPT_VERSION_OPTION_DECL}, {NULL, 0, NULL, 0} @@ -131,6 +135,9 @@ and column three contains lines common to both files.\n\ fputs (_("\ --output-delimiter=STR separate columns with STR\n\ "), stdout); + fputs (_("\ + -z, --zero-terminated line delimiter is NUL, not newline\n\ +"), stdout); fputs (HELP_OPTION_DESCRIPTION, stdout); fputs (VERSION_OPTION_DESCRIPTION, stdout); fputs (_("\ @@ -277,7 +284,8 @@ compare_files (char **infiles) fadvise (streams[i], FADVISE_SEQUENTIAL); - thisline[i] = readlinebuffer (all_line[i][alt[i][0]], streams[i]); + thisline[i] = readlinebuffer_delim (all_line[i][alt[i][0]], streams[i], + delim); if (ferror (streams[i])) error (EXIT_FAILURE, errno, "%s", quotef (infiles[i])); } @@ -336,7 +344,8 @@ compare_files (char **infiles) alt[i][1] = alt[i][0]; alt[i][0] = (alt[i][0] + 1) & 0x03; - thisline[i] = readlinebuffer (all_line[i][alt[i][0]], streams[i]); + thisline[i] = readlinebuffer_delim (all_line[i][alt[i][0]], + streams[i], delim); if (thisline[i]) check_order (all_line[i][alt[i][1]], thisline[i], i + 1); @@ -382,7 +391,7 @@ main (int argc, char **argv) issued_disorder_warning[0] = issued_disorder_warning[1] = false; check_input_order = CHECK_ORDER_DEFAULT; - while ((c = getopt_long (argc, argv, "123", long_options, NULL)) != -1) + while ((c = getopt_long (argc, argv, "123z", long_options, NULL)) != -1) switch (c) { case '1': @@ -397,6 +406,10 @@ main (int argc, char **argv) both = false; break; + case 'z': + delim = '\0'; + break; + case NOCHECK_ORDER_OPTION: check_input_order = CHECK_ORDER_DISABLED; break; diff --git a/tests/misc/comm.pl b/tests/misc/comm.pl index 52d14ba..3232d63 100755 --- 
a/tests/misc/comm.pl +++ b/tests/misc/comm.pl @@ -28,14 +28,17 @@ my $prog = 'comm'; @ENV{qw(LANGUAGE LANG LC_ALL)} = ('C') x 3; my @inputs = ({IN=>{a=>"1\n3"}}, {IN=>{b=>"2\n3"}}); +my @zinputs = ({IN=>{za=>"1\0003"}}, {IN=>{zb=>"2\0003"}}); my @Tests = ( # basic operation ['basic', @inputs, {OUT=>"1\n\t2\n\t\t3\n"} ], + ['zbasic', '-z', @zinputs, {OUT=>"1\0\t2\0\t\t3\0"} ], # suppress lines unique to file 1 ['opt-1', '-1', @inputs, {OUT=>"2\n\t3\n"} ], + ['zopt-1', '-z', '-1', @zinputs, {OUT=>"2\0\t3\0"} ], # suppress lines unique to file 2 ['opt-2', '-2', @inputs, {OUT=>"1\n\t3\n"} ], -- 2.5.0 From d79b96f443388b81e65ee230e02b64b7484356f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A1draig=20Brady?= <[email protected]> Date: Fri, 8 Jan 2016 15:42:56 +0000 Subject: [PATCH 5/7] comm: support NUL --output-delimiter for consistency * src/comm.c (main): Track the output delimiter length, so that it can be adjusted to 1 for the NUL delimiter. Also rename the global variable from "delimiter" to "col_sep" so its use is more obvious, and to distinguish from the recently added "delim" global variable. * tests/misc/comm.pl: Adjust accordingly. --- src/comm.c | 34 +++++++++++++--------------------- tests/misc/comm.pl | 12 +++++++----- 2 files changed, 20 insertions(+), 26 deletions(-) diff --git a/src/comm.c b/src/comm.c index e66ac81..802bf90 100644 --- a/src/comm.c +++ b/src/comm.c @@ -71,9 +71,9 @@ static enum } check_input_order; /* Output columns will be delimited with this string, which may be set - on the command-line with --output-delimiter=STR. The default is a - single TAB character. */ -static char const *delimiter; + on the command-line with --output-delimiter=STR. */ +static char const *col_sep = "\t"; +static size_t col_sep_len = 0; /* For long options that have no equivalent short option, use a non-character as a pseudo short option, starting with CHAR_MAX + 1. 
*/ @@ -174,20 +174,17 @@ writeline (struct linebuffer const *line, FILE *stream, int class) case 2: if (!only_file_2) return; - /* Print a delimiter if we are printing lines from file 1. */ if (only_file_1) - fputs (delimiter, stream); + fwrite (col_sep, 1, col_sep_len, stream); break; case 3: if (!both) return; - /* Print a delimiter if we are printing lines from file 1. */ if (only_file_1) - fputs (delimiter, stream); - /* Print a delimiter if we are printing lines from file 2. */ + fwrite (col_sep, 1, col_sep_len, stream); if (only_file_2) - fputs (delimiter, stream); + fwrite (col_sep, 1, col_sep_len, stream); break; } @@ -419,14 +416,10 @@ main (int argc, char **argv) break; case OUTPUT_DELIMITER_OPTION: - if (delimiter && !STREQ (delimiter, optarg)) - error (EXIT_FAILURE, 0, _("multiple delimiters specified")); - delimiter = optarg; - if (!*delimiter) - { - error (EXIT_FAILURE, 0, _("empty %s not allowed"), - quote ("--output-delimiter")); - } + if (col_sep_len && !STREQ (col_sep, optarg)) + error (EXIT_FAILURE, 0, _("multiple output delimiters specified")); + col_sep = optarg; + col_sep_len = *optarg ? strlen (optarg) : 1; break; case_GETOPT_HELP_CHAR; @@ -437,6 +430,9 @@ main (int argc, char **argv) usage (EXIT_FAILURE); } + if (! col_sep_len) + col_sep_len = 1; + if (argc - optind < 2) { if (argc <= optind) @@ -452,10 +448,6 @@ main (int argc, char **argv) usage (EXIT_FAILURE); } - /* The default delimiter is a TAB. 
*/ - if (!delimiter) - delimiter = "\t"; - compare_files (argv + optind); if (issued_disorder_warning[0] || issued_disorder_warning[1]) diff --git a/tests/misc/comm.pl b/tests/misc/comm.pl index 3232d63..c5cd27f 100755 --- a/tests/misc/comm.pl +++ b/tests/misc/comm.pl @@ -134,13 +134,15 @@ my @Tests = ['delim-2char', '--output-delimiter=++', @inputs, {OUT=>"1\n++2\n++++3\n"} ], - # invalid empty delimiter - ['delim-empty', '--output-delimiter=', @inputs, {EXIT=>1}, - {ERR => "$prog: empty '--output-delimiter' not allowed\n"}], + # NUL delimiter + ['delim-empty', '--output-delimiter=', @inputs, + {OUT=>"1\n\0002\n\000\0003\n"} ], + ['zdelim-empty', '-z', '-z --output-delimiter=', @zinputs, + {OUT=>"1\000\0002\000\000\0003\000"} ], # invalid dual delimiter - ['delim-dual', '--output-delimiter=,', '--output-delimiter=+', - @inputs, {EXIT=>1}, {ERR => "$prog: multiple delimiters specified\n"}], + ['delim-dual', '--output-delimiter=,', '--output-delimiter=+', @inputs, + {EXIT=>1}, {ERR => "$prog: multiple output delimiters specified\n"}], # valid dual delimiter specification ['delim-dual2', '--output-delimiter=,', '--output-delimiter=,', @inputs, -- 2.5.0 From 77865899176cb67ea1f2b3e935a7a181f7ad77c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A1draig=20Brady?= <[email protected]> Date: Fri, 8 Jan 2016 15:57:06 +0000 Subject: [PATCH 6/7] paste: add the -z,--zero-terminated option * doc/coreutils.texi (paste invocation): Reference -z description. * src/paste.c (main): Parameterize the use of '\n'. * tests/misc/paste.pl: Add test cases. * NEWS: Mention the new feature. 
--- NEWS | 2 +- doc/coreutils.texi | 2 ++ src/paste.c | 26 ++++++++++++++++++-------- tests/misc/paste.pl | 10 ++++++++++ 4 files changed, 31 insertions(+), 9 deletions(-) diff --git a/NEWS b/NEWS index 30a1526..929dfcd 100644 --- a/NEWS +++ b/NEWS @@ -33,7 +33,7 @@ GNU coreutils NEWS -*- outline -*- ** New features - comm, cut, head, tail now have the -z,--zero-terminated option, and + comm, cut, head, paste, tail now have the -z,--zero-terminated option, and tac --separator accepts an empty argument, to work with NUL delimited items. dd now summarizes sizes in --human-readable format too, not just --si. diff --git a/doc/coreutils.texi b/doc/coreutils.texi index 99f0f2d..e878474 100644 --- a/doc/coreutils.texi +++ b/doc/coreutils.texi @@ -6000,6 +6000,8 @@ $ paste -d '%_' num2 let3 num2 %c_ @end example +@optZeroTerminated + @end table @exitstatus diff --git a/src/paste.c b/src/paste.c index a5acecd..bf99fe0 100644 --- a/src/paste.c +++ b/src/paste.c @@ -67,10 +67,13 @@ static char *delims; /* A pointer to the character after the end of 'delims'. */ static char const *delim_end; +static unsigned char line_delim = '\n'; + static struct option const longopts[] = { {"serial", no_argument, NULL, 's'}, {"delimiters", required_argument, NULL, 'd'}, + {"zero-terminated", no_argument, NULL, 'z'}, {GETOPT_HELP_OPTION_DECL}, {GETOPT_VERSION_OPTION_DECL}, {NULL, 0, NULL, 0} @@ -250,7 +253,7 @@ paste_parallel (size_t nfiles, char **fnamptr) while (chr != EOF) { sometodo = true; - if (chr == '\n') + if (chr == line_delim) break; xputchar (chr); chr = getc (fileptr[i]); @@ -295,7 +298,7 @@ paste_parallel (size_t nfiles, char **fnamptr) write_error (); delims_saved = 0; } - xputchar ('\n'); + xputchar (line_delim); } continue; /* Next read of files, or exit. */ } @@ -316,7 +319,7 @@ paste_parallel (size_t nfiles, char **fnamptr) /* Except for last file, replace last newline with delim. 
*/ if (i + 1 != nfiles) { - if (chr != '\n' && chr != EOF) + if (chr != line_delim && chr != EOF) xputchar (chr); if (*delimptr != EMPTY_DELIM) xputchar (*delimptr); @@ -327,7 +330,7 @@ paste_parallel (size_t nfiles, char **fnamptr) { /* If the last line of the last file lacks a newline, print one anyhow. POSIX requires this. */ - char c = (chr == EOF ? '\n' : chr); + char c = (chr == EOF ? line_delim : chr); xputchar (c); } } @@ -386,7 +389,7 @@ paste_serial (size_t nfiles, char **fnamptr) while ((charnew = getc (fileptr)) != EOF) { /* Process the old character. */ - if (charold == '\n') + if (charold == line_delim) { if (*delimptr != EMPTY_DELIM) xputchar (*delimptr); @@ -405,8 +408,8 @@ paste_serial (size_t nfiles, char **fnamptr) xputchar (charold); } - if (charold != '\n') - xputchar ('\n'); + if (charold != line_delim) + xputchar (line_delim); if (ferror (fileptr)) { @@ -447,6 +450,9 @@ each FILE, separated by TABs, to standard output.\n\ -d, --delimiters=LIST reuse characters from LIST instead of TABs\n\ -s, --serial paste one file at a time instead of in parallel\n\ "), stdout); + fputs (_("\ + -z, --zero-terminated line delimiter is NUL, not newline\n\ +"), stdout); fputs (HELP_OPTION_DESCRIPTION, stdout); fputs (VERSION_OPTION_DESCRIPTION, stdout); /* FIXME: add a couple of examples. 
*/ @@ -473,7 +479,7 @@ main (int argc, char **argv) have_read_stdin = false; serial_merge = false; - while ((optc = getopt_long (argc, argv, "d:s", longopts, NULL)) != -1) + while ((optc = getopt_long (argc, argv, "d:sz", longopts, NULL)) != -1) { switch (optc) { @@ -486,6 +492,10 @@ main (int argc, char **argv) serial_merge = true; break; + case 'z': + line_delim = '\0'; + break; + case_GETOPT_HELP_CHAR; case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS); diff --git a/tests/misc/paste.pl b/tests/misc/paste.pl index b4409e7..7c95597 100755 --- a/tests/misc/paste.pl +++ b/tests/misc/paste.pl @@ -34,6 +34,11 @@ my @Tests = ['no-nl-3', {IN=>"a"}, {IN=>"b\n"}, {OUT=>"a\tb\n"}], ['no-nl-4', {IN=>"a\n"}, {IN=>"b\n"}, {OUT=>"a\tb\n"}], + ['zno-nl-1', '-z', {IN=>"a"}, {IN=>"b"}, {OUT=>"a\tb\0"}], + ['zno-nl-2', '-z', {IN=>"a\0"}, {IN=>"b"}, {OUT=>"a\tb\0"}], + ['zno-nl-3', '-z', {IN=>"a"}, {IN=>"b\0"}, {OUT=>"a\tb\0"}], + ['zno-nl-4', '-z', {IN=>"a\0"}, {IN=>"b\0"}, {OUT=>"a\tb\0"}], + # Same as above, but with a two lines in each input file and # the addition of the -d option to make SPACE be the output delimiter. ['no-nla1', '-d" "', {IN=>"1\na"}, {IN=>"2\nb"}, {OUT=>"1 2\na b\n"}], @@ -41,6 +46,11 @@ my @Tests = ['no-nla3', '-d" "', {IN=>"1\na"}, {IN=>"2\nb\n"}, {OUT=>"1 2\na b\n"}], ['no-nla4', '-d" "', {IN=>"1\na\n"}, {IN=>"2\nb\n"}, {OUT=>"1 2\na b\n"}], + ['zno-nla1', '-zd" "', {IN=>"1\0a"}, {IN=>"2\0b"}, {OUT=>"1 2\0a b\0"}], + ['zno-nla2', '-zd" "', {IN=>"1\0a\0"}, {IN=>"2\0b"}, {OUT=>"1 2\0a b\0"}], + ['zno-nla3', '-zd" "', {IN=>"1\0a"}, {IN=>"2\0b\0"}, {OUT=>"1 2\0a b\0"}], + ['zno-nla4', '-zd" "', {IN=>"1\0a\0"}, {IN=>"2\0b\0"}, {OUT=>"1 2\0a b\0"}], + # Specifying a delimiter with a trailing backslash would overrun a # malloc'd buffer. 
['delim-bs1', q!-d'\'!, {IN=>{'a'x50=>''}}, {EXIT => 1}, -- 2.5.0 From 25d17125badcb603a9a303ca6a5f6912bfc01b41 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A1draig=20Brady?= <[email protected]> Date: Fri, 8 Jan 2016 16:29:40 +0000 Subject: [PATCH 7/7] wc: add the -z,--zero option * doc/coreutils.texi (wc invocation): Reference the --zero description. * src/wc.c (main): Parse the --zero option. (write_counts): Write the "line_end" character instead of '\n'. * tests/misc/wc-files0.sh: Add a test case. * NEWS: Mention the new feature. --- NEWS | 2 +- doc/coreutils.texi | 2 ++ src/wc.c | 11 ++++++++++- tests/misc/wc-files0.sh | 8 ++++++++ 4 files changed, 21 insertions(+), 2 deletions(-) diff --git a/NEWS b/NEWS index 929dfcd..2079887 100644 --- a/NEWS +++ b/NEWS @@ -33,7 +33,7 @@ GNU coreutils NEWS -*- outline -*- ** New features - comm, cut, head, paste, tail now have the -z,--zero-terminated option, and + comm, cut, head, paste, tail, wc now have the -z,--zero-terminated option, and tac --separator accepts an empty argument, to work with NUL delimited items. dd now summarizes sizes in --human-readable format too, not just --si. diff --git a/doc/coreutils.texi b/doc/coreutils.texi index e878474..771418e 100644 --- a/doc/coreutils.texi +++ b/doc/coreutils.texi @@ -3677,6 +3677,8 @@ find . -name '*.[ch]' -print0 | wc -L --files0-from=- | tail -n1 @end example +@optZero + @end table @exitstatus diff --git a/src/wc.c b/src/wc.c index c2a9c3f..ea2c20f 100644 --- a/src/wc.c +++ b/src/wc.c @@ -70,6 +70,9 @@ static int number_width; /* True if we have ever read the standard input. */ static bool have_read_stdin; +/* Support NUL line endings. */ +static unsigned char line_end = '\n'; + /* The result of calling fstat or stat on a file descriptor or file. 
*/ struct fstatus { @@ -96,6 +99,7 @@ static struct option const longopts[] = {"words", no_argument, NULL, 'w'}, {"files0-from", required_argument, NULL, FILES0_FROM_OPTION}, {"max-line-length", no_argument, NULL, 'L'}, + {"zero", no_argument, NULL, 'z'}, {GETOPT_HELP_OPTION_DECL}, {GETOPT_VERSION_OPTION_DECL}, {NULL, 0, NULL, 0} @@ -135,6 +139,7 @@ the following order: newline, word, character, byte, maximum line length.\n\ If F is - then read names from standard input\n\ -L, --max-line-length print the maximum display width\n\ -w, --words print the word counts\n\ + -z, --zero end each output line with NUL, not newline\n\ "), stdout); fputs (HELP_OPTION_DESCRIPTION, stdout); fputs (VERSION_OPTION_DESCRIPTION, stdout); @@ -183,7 +188,7 @@ write_counts (uintmax_t lines, } if (file) printf (" %s", file); - putchar ('\n'); + putchar (line_end); } /* Count words. FILE_X is the name of the file (or NULL for standard @@ -669,6 +674,10 @@ main (int argc, char **argv) print_linelength = true; break; + case 'z': + line_end = '\0'; + break; + case FILES0_FROM_OPTION: files_from = optarg; break; diff --git a/tests/misc/wc-files0.sh b/tests/misc/wc-files0.sh index b6a204c..6220e72 100755 --- a/tests/misc/wc-files0.sh +++ b/tests/misc/wc-files0.sh @@ -40,4 +40,12 @@ if test "$fail" = ''; then compare exp out || fail=1 fi +if test "$fail" = ''; then + # Repeat the above test, but output NULs instead of newlines + rm -f out + tr '\n' '\0' < exp > zexp || framework_failure_ + wc -z --files0-from=- < names > out || fail=1 + compare zexp out || fail=1 +fi + Exit $fail -- 2.5.0
