Re: [PATCH] csplit: new option --suppress-matched

Assaf Gordon Thu, 28 Mar 2013 15:10:26 -0700

Hello,


Assaf Gordon wrote, On 03/07/2013 05:39 PM:
> 
> Attached is a new option for csplit, suppress-matched, as been mentioned few 
> times before (e.g. 
> http://lists.gnu.org/archive/html/coreutils/2013-02/msg00170.html ).
> 

Attached updated version (works with both regexp and int patterns).
Also updated tests.

Comments are welcomed,
  -gordon

>From eec5cf679824ed67c8b751ecb90565a22fc51719 Mon Sep 17 00:00:00 2001
From: Assaf Gordon <[email protected]>
Date: Wed, 6 Mar 2013 15:53:16 -0500
Subject: [PATCH] csplit: new option --suppress-matched

With --suppress-matched, the lines that match the pattern will not be
printed in the output files.

* src/csplit.c: implement --suppress-matched.
process_regexp(),process_line_count(): skip the matched lines without
printing. Since csplit always does "up to but not including" matched
lines, the first line (in the next group) is the matched line - just
skip it.
main(): handle new option.
usage(): mention new option.
* NEWS: mention new option.
* doc/coreutils.texi: mention new option, add examples.
* tests/misc/csplit-suppress-matched.pl: test new option.
* tests/local.mk: add new test script.
---
 NEWS                                  |    3 +
 doc/coreutils.texi                    |   25 ++++
 src/csplit.c                          |   29 ++++-
 tests/local.mk                        |    1 +
 tests/misc/csplit-suppress-matched.pl |  213 +++++++++++++++++++++++++++++++++
 5 files changed, 268 insertions(+), 3 deletions(-)
 create mode 100644 tests/misc/csplit-suppress-matched.pl

diff --git a/NEWS b/NEWS
index 0c2daad..896512d 100644
--- a/NEWS
+++ b/NEWS
@@ -18,6 +18,9 @@ GNU coreutils NEWS                                    -*- outline -*-
   uniq accepts a new option: --group to print all items, while separating
   unique groups with empty lines.
 
+  csplit accepts a new option: --suppress-matched (-m). Lines matching
+  the specified patterns will not be printed.
+
 ** Improvements
 
   stat and tail work better with EFIVARFS, EXOFS, F2FS and UBIFS.
diff --git a/doc/coreutils.texi b/doc/coreutils.texi
index dfa9b1c..7dfe724 100644
--- a/doc/coreutils.texi
+++ b/doc/coreutils.texi
@@ -3607,6 +3607,12 @@ long instead of the default 2.
 @opindex --keep-files
 Do not remove output files when errors are encountered.
 
+@item -m
+@itemx --suppress-matched
+@opindex -m
+@opindex --suppress-matched
+Do not output lines matching the specified @var{pattern}.
+
 @item -z
 @itemx --elide-empty-files
 @opindex -z
@@ -3683,6 +3689,25 @@ $ head xx*
 14
 @end example
 
+Example of splitting input by empty lines:
+
+@example
+$ csplit --suppress-matched @var{input.txt} '/^$/' '@{*@}'
+@end example
+
+@c
+@c TODO: "uniq" already supports "--group".
+@c        when it gets the "--key" option, uncomment this example.
+@c
+@c Example of splitting input file, based on the value of column 2:
+@c
+@c @example
+@c $ cat @var{input.txt} |
+@c       sort -k2,2 |
+@c       uniq --group -k2,2 |
+@c       csplit -m '/^$/' '@{*@}'
+@c @end example
+
 @node Summarizing files
 @chapter Summarizing files
 
diff --git a/src/csplit.c b/src/csplit.c
index 22f3ad4..4ae2de2 100644
--- a/src/csplit.c
+++ b/src/csplit.c
@@ -166,6 +166,9 @@ static bool volatile remove_files;
 /* If true, remove all output files which have a zero length. */
 static bool elide_empty_files;
 
+/* If true, suppress the lines that match the PATTERN */
+static bool suppress_matched;
+
 /* The compiled pattern arguments, which determine how to split
    the input file. */
 static struct control *controls;
@@ -185,6 +188,7 @@ static struct option const longopts[] =
   {"elide-empty-files", no_argument, NULL, 'z'},
   {"prefix", required_argument, NULL, 'f'},
   {"suffix-format", required_argument, NULL, 'b'},
+  {"suppress-matched", no_argument, NULL, 'm'},
   {GETOPT_HELP_OPTION_DECL},
   {GETOPT_VERSION_OPTION_DECL},
   {NULL, 0, NULL, 0}
@@ -721,8 +725,13 @@ process_line_count (const struct control *p, uintmax_t repetition)
 
   create_output_file ();
 
-  linenum = get_first_line_in_buffer ();
+  /* Ensure that the line number specified is not 1 greater than
+     the number of lines in the file.
+     When suppressing matched lines, check before the loop. */
+  if (no_more_lines () && suppress_matched)
+    handle_line_error (p, repetition);
 
+  linenum = get_first_line_in_buffer ();
   while (linenum++ < last_line_to_save)
     {
       line = remove_line ();
@@ -733,9 +742,12 @@ process_line_count (const struct control *p, uintmax_t repetition)
 
   close_output_file ();
 
+  if (suppress_matched)
+    line = remove_line ();
+
   /* Ensure that the line number specified is not 1 greater than
      the number of lines in the file. */
-  if (no_more_lines ())
+  if (no_more_lines () && !suppress_matched)
     handle_line_error (p, repetition);
 }
 
@@ -778,6 +790,9 @@ process_regexp (struct control *p, uintmax_t repetition)
   if (!ignore)
     create_output_file ();
 
+  if (suppress_matched && current_line > 0)
+    line = remove_line ();
+
   /* If there is no offset for the regular expression, or
      it is positive, then it is not necessary to buffer the lines. */
 
@@ -1324,9 +1339,10 @@ main (int argc, char **argv)
   control_used = 0;
   suppress_count = false;
   remove_files = true;
+  suppress_matched = false;
   prefix = DEFAULT_PREFIX;
 
-  while ((optc = getopt_long (argc, argv, "f:b:kn:sqz", longopts, NULL)) != -1)
+  while ((optc = getopt_long (argc, argv, "f:b:kmn:sqz", longopts, NULL)) != -1)
     switch (optc)
       {
       case 'f':
@@ -1341,6 +1357,10 @@ main (int argc, char **argv)
         remove_files = false;
         break;
 
+      case 'm':
+        suppress_matched = true;
+        break;
+
       case 'n':
         if (xstrtoul (optarg, NULL, 10, &val, "") != LONGINT_OK
             || MIN (INT_MAX, SIZE_MAX) < val)
@@ -1465,6 +1485,9 @@ and output byte counts of each piece to standard output.\n\
   -k, --keep-files           do not remove output files on errors\n\
 "), stdout);
       fputs (_("\
+  -m, --suppress-matched     suppress the lines matching PATTERN\n\
+"), stdout);
+      fputs (_("\
   -n, --digits=DIGITS        use specified number of digits instead of 2\n\
   -s, --quiet, --silent      do not print counts of output file sizes\n\
   -z, --elide-empty-files    remove empty output files\n\
diff --git a/tests/local.mk b/tests/local.mk
index dc87ef4..e3a72ab 100644
--- a/tests/local.mk
+++ b/tests/local.mk
@@ -260,6 +260,7 @@ all_tests =					\
   tests/misc/csplit.sh				\
   tests/misc/csplit-1000.sh			\
   tests/misc/csplit-heap.sh			\
+  tests/misc/csplit-suppress-matched.pl		\
   tests/misc/date-sec.sh			\
   tests/misc/dircolors.pl			\
   tests/misc/dirname.pl				\
diff --git a/tests/misc/csplit-suppress-matched.pl b/tests/misc/csplit-suppress-matched.pl
new file mode 100644
index 0000000..512bdaa
--- /dev/null
+++ b/tests/misc/csplit-suppress-matched.pl
@@ -0,0 +1,213 @@
+#!/usr/bin/perl
+
+# Copyright (C) 2013 Free Software Foundation, Inc.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+use strict;
+use Data::Dumper;
+
+my $limits = getlimits ();
+
+my $prog = 'csplit';
+
+# Turn off localization of executable's output.
+@ENV{qw(LANGUAGE LANG LC_ALL)} = ('C') x 3;
+
+# Input from 'seq 6'
+my $IN_SEQ_6 =<<EOF;
+1
+2
+3
+4
+5
+6
+EOF
+
+# Input from a possible run of 'uniq --group'
+# (groups separated by empty lines)
+my $IN_UNIQ =<<EOF;
+a
+a
+YY
+
+XX
+b
+b
+YY
+
+XX
+c
+YY
+
+XX
+d
+d
+d
+EOF
+
+# Standard Coreutils::run_tests() structure, except for the addition of
+# "OUTPUTS" array, containing the expected content of the output files.
+# See code below for conversion into PRE/CMP/POST checks.
+my @csplit_tests =
+(
+  # without suppress-matched,
+  # the newline (matched line) appears in the output files
+  ["re-base", "-q - '/^\$/' '{*}'", {IN_PIPE => $IN_UNIQ},
+    {OUTPUTS => [ "a\na\nYY\n", "\nXX\nb\nb\nYY\n","\nXX\nc\nYY\n",
+                  "\nXX\nd\nd\nd\n" ] }],
+
+  # the newline (matched line) does not appear in the output files
+  ["re-1", "-m -q - '/^\$/' '{*}'", {IN_PIPE => $IN_UNIQ},
+    {OUTPUTS => ["a\na\nYY\n", "XX\nb\nb\nYY\n", "XX\nc\nYY\n",
+                 "XX\nd\nd\nd\n"]}],
+
+  # the 'XX' (matched line + offset 1) does not appear in the output files.
+  # the newline appears in the files (before each split, at the end of the file)
+  ["re-2", "-m -q - '/^\$/1' '{*}'", {IN_PIPE => $IN_UNIQ},
+    {OUTPUTS => ["a\na\nYY\n\n","b\nb\nYY\n\n","c\nYY\n\n","d\nd\nd\n"]}],
+
+  # the 'YY' (matched line + offset of -1) does not appear in the output files
+  # the newline appears in the files (as the first line of the new split)
+  ["re-3", "-m -q - '/^\$/-1' '{*}'", {IN_PIPE => $IN_UNIQ},
+    {OUTPUTS => ["a\na\n", "\nXX\nb\nb\n", "\nXX\nc\n", "\nXX\nd\nd\nd\n"]}],
+
+  # Test two consecutive matched lines
+  # without suppress-matched, the second file should contain a single newline.
+  ["re-4.1", "      -q - '/^\$/' '{*}'", {IN_PIPE => "a\n\n\nb\n"},
+    {OUTPUTS => [ "a\n", "\n", "\nb\n" ]}],
+  # suppress-matched will cause the second file to be empty.
+  ["re-4.2", "   -m -q - '/^\$/' '{*}'", {IN_PIPE => "a\n\n\nb\n"},
+    {OUTPUTS => [ "a\n", "", "b\n" ]}],
+  # suppress-matched + elide-empty should output just two files.
+  ["re-4.3", "-z -m -q - '/^\$/' '{*}'", {IN_PIPE => "a\n\n\nb\n"},
+    {OUTPUTS => [ "a\n", "b\n" ]}],
+
+
+  # Test a matched-line as the last line
+  # default: last file with newline should be created.
+  ["re-5.1", "      -q - '/^\$/' '{*}'", {IN_PIPE => "a\n\nb\n\n"},
+    {OUTPUTS => [ "a\n", "\nb\n", "\n" ]}],
+  # suppress-matched - last empty files should be created.
+  ["re-5.2", "   -m -q - '/^\$/' '{*}'", {IN_PIPE => "a\n\nb\n\n"},
+    {OUTPUTS => [ "a\n", "b\n", "" ]}],
+  # suppress-matched + elide-empty: just two files should be created.
+  ["re-5.3", "-z -m -q - '/^\$/' '{*}'", {IN_PIPE => "a\n\nb\n\n"},
+    {OUTPUTS => [ "a\n", "b\n" ]}],
+
+  # without suppress-matched,
+  # the matched lines (2/4/6) appear in the output files
+  ["int-base",    '-q - 2 4 6', {IN_PIPE => $IN_SEQ_6},
+    {OUTPUTS => [ "1\n", "2\n3\n", "4\n5\n", "6\n" ]}],
+  # suppress matched - the matching lines (2/4/6) should not appear.
+  ["int-1", '   -m -q - 2 4 6', {IN_PIPE => $IN_SEQ_6},
+    {OUTPUTS => [ "1\n", "3\n", "5\n", "" ]}],
+  # suppress matched + elide-empty
+  ["int-2", '-z -m -q - 2 4 6', {IN_PIPE => $IN_SEQ_6},
+    {OUTPUTS => [ "1\n", "3\n", "5\n" ]}],
+);
+
+
+
+=pod
+The following loop translates @csplit_tests into a Coreutils::run_tests()
+compatible structure. It converts "OUTPUTS" key into "CMP" + "POST" keys:
+1. Each element in the OUTPUTS key is expected to be an output file
+   from csplit (named xx00, xx01, xx02...)
+   create a "CMP" key for each one, with the output and the filename.
+2. Add a "POST" key, ensuring no extra files have been created.
+   (e.g. if there are 4 expected outputs, xx00 to xx03,
+    ensure xx04 doesn't exist).
+3. Add a "PRE" key, deleting all existing 'xx*' files.
+
+Example:
+
+Before conversion:
+   my @csplit_tests =
+   (
+     ["1", '-z -q - 2 4 6',
+       {IN_PIPE => "1\n2\n3\n4\n5\n6\n"},
+       {OUTPUTS => [ "1\n", "2\n3\n", "4\n5\n", "6\n" ]},
+     ]
+   )
+
+After conversion:
+
+   my @csplit_tests =
+   (
+     ["1", '-z -q - 2 4 6',
+       {IN_PIPE => "1\n2\n3\n4\n5\n6\n"},
+       {PRE => sub { unlink glob './xx??' ; }},
+       {CMP => ["1\n",    {'xx00'=> undef}]},
+       {CMP => ["2\n3\n", {'xx01'=> undef}]},
+       {CMP => ["4\n5\n", {'xx02'=> undef}]},
+       {CMP => ["6\n",    {'xx03'=> undef}]},
+       {POST => sub { die "extra file" if -e 'xx04'}},
+     ],
+    );
+=cut
+my @Tests;
+foreach my $t (@csplit_tests)
+  {
+    my ($test_name, $cmdline, @others) = @$t;
+    my $new_ent = [$test_name, $cmdline];
+
+    my $out_file_num = 0 ;
+
+    foreach my $e (@others)
+      {
+        die "Internal error: expecting a hash (e.g. IN_PIPE/OUTPUTS/ERR)" .
+            " in test '$test_name', got $e"
+            unless ref $e && (ref $e eq 'HASH');
+
+        my ($key, $value) = each %$e;
+        if ($key eq 'OUTPUTS')
+          {
+            # Convert each expected OUTPUT to a 'CMP' key (xx00, xx01, ...).
+            foreach my $output (@$value)
+              {
+                my $filename = sprintf("xx%02d",$out_file_num++);
+                my $cmp = {CMP => [ $output, { $filename => undef}]};
+                push @$new_ent, $cmp;
+              }
+
+            # Add a 'POST' check
+            # Ensure no extra files have been created.
+            my $filename = sprintf("xx%02d",$out_file_num++);
+            my $post = { POST => sub { die "Test failed: an extraneous file " .
+                                "'$filename' has been created\n"
+                                if -e $filename; } } ;
+            push @$new_ent, $post;
+
+            # before running each test, cleanup the 'xx00' files
+            # from previous runs.
+            my $pre = { PRE => sub { unlink glob "./xx??"; } };
+            push @$new_ent, $pre;
+          }
+        else
+          {
+            # pass other entities as-is (e.g. OUT, ERR, OUT_SUBST, EXIT)
+            # run_tests() will know how to handle them.
+            push @$new_ent, $e;
+          }
+      }
+
+    push @Tests, $new_ent;
+  }
+
+my $save_temps = $ENV{DEBUG};
+my $verbose = $ENV{VERBOSE};
+
+my $fail = run_tests ($prog, $prog, \@Tests, $save_temps, $verbose);
+exit $fail;
-- 
1.7.7.4

Re: [PATCH] csplit: new option --suppress-matched

Reply via email to