[PATCH] csplit: new option --suppress-matched

Assaf Gordon Thu, 07 Mar 2013 14:39:51 -0800

Hello,

Attached is a patch adding a new option to csplit, --suppress-matched, as has
been mentioned a few times before (e.g.
http://lists.gnu.org/archive/html/coreutils/2013-02/msg00170.html ).


It works well for REGEXP patterns, but there's a bug with INTEGER patterns that
I haven't been able to pinpoint yet (suggestions are welcome).

Regards,
  -gordon

>From 49f43214ebfa41fa1f67e7001d8467288ff34837 Mon Sep 17 00:00:00 2001
From: Assaf Gordon <assafgor...@gmail.com>
Date: Wed, 6 Mar 2013 15:53:16 -0500
Subject: [PATCH] csplit: new option, --suppress-matched

FIXME: Currently works only with REGEXP patterns.

With --suppress-matched, the lines that match the pattern will not be
printed in the output files.

* src/csplit.c: implement --suppress-matched.
process_regexp(), process_line_count(): skip the matched line without
printing. Since csplit always does "up to but not including" matched
lines, the first line (in the next group) is the matched line - just
skip it.
main(): handle new option.
usage(): mention new option.
* NEWS: mention new option.
* doc/coreutils.texi: mention new option, add examples.
* tests/misc/csplit-suppress-matched.sh: test new option.
* tests/local.mk: add new test script.
---
 NEWS                                  |    3 +
 doc/coreutils.texi                    |   25 ++++
 src/csplit.c                          |   26 ++++-
 tests/local.mk                        |    1 +
 tests/misc/csplit-suppress-matched.sh |  233 +++++++++++++++++++++++++++++++++
 5 files changed, 287 insertions(+), 1 deletions(-)
 create mode 100755 tests/misc/csplit-suppress-matched.sh

diff --git a/NEWS b/NEWS
index 5b28c92..2385be7 100644
--- a/NEWS
+++ b/NEWS
@@ -18,6 +18,9 @@ GNU coreutils NEWS                                    -*- outline -*-
   uniq accepts a new option: --group to print all items, while separating
   unique groups with empty lines.
 
+  csplit accepts a new option: --suppress-matched (-m). Lines matching
+  the specified patterns will not be printed.
+
 
 * Noteworthy changes in release 8.21 (2013-02-14) [stable]
 
diff --git a/doc/coreutils.texi b/doc/coreutils.texi
index fe4c3ad..4f7da4c 100644
--- a/doc/coreutils.texi
+++ b/doc/coreutils.texi
@@ -3608,6 +3608,12 @@ long instead of the default 2.
 @opindex --keep-files
 Do not remove output files when errors are encountered.
 
+@item -m
+@itemx --suppress-matched
+@opindex -m
+@opindex --suppress-matched
+Do not output lines matching the specified @var{pattern}.
+
 @item -z
 @itemx --elide-empty-files
 @opindex -z
@@ -3684,6 +3690,25 @@ $ head xx*
 14
 @end example
 
+Example of splitting input by empty lines:
+
+@example
+$ csplit --suppress-matched @var{input.txt} '/^$/' '@{*@}'
+@end example
+
+@c
+@c TODO: "uniq" already supports "--group".
+@c        when it gets the "--key" option, uncomment this example.
+@c
+@c Example of splitting input file, based on the value of column 2:
+@c
+@c @example
+@c $ cat @var{input.txt} |
+@c       sort -k2,2 |
+@c       uniq --group -k2,2 |
+@c       csplit -m '/^$/' '@{*@}'
+@c @end example
+
 @node Summarizing files
 @chapter Summarizing files
 
diff --git a/src/csplit.c b/src/csplit.c
index 22f3ad4..664b567 100644
--- a/src/csplit.c
+++ b/src/csplit.c
@@ -166,6 +166,9 @@ static bool volatile remove_files;
 /* If true, remove all output files which have a zero length. */
 static bool elide_empty_files;
 
+/* If true, suppress the lines that match the PATTERN */
+static bool suppress_matched;
+
 /* The compiled pattern arguments, which determine how to split
    the input file. */
 static struct control *controls;
@@ -185,6 +188,7 @@ static struct option const longopts[] =
   {"elide-empty-files", no_argument, NULL, 'z'},
   {"prefix", required_argument, NULL, 'f'},
   {"suffix-format", required_argument, NULL, 'b'},
+  {"suppress-matched", no_argument, NULL, 'm'},
   {GETOPT_HELP_OPTION_DECL},
   {GETOPT_VERSION_OPTION_DECL},
   {NULL, 0, NULL, 0}
@@ -721,6 +725,15 @@ process_line_count (const struct control *p, uintmax_t repetition)
 
   create_output_file ();
 
+#if 0
+  /* FIXME: this doesn't work when the last line is the matched line
+   * e.g.:
+   *   $ seq 1 6 | ./src/csplit -m - 2 4 6
+   */
+  if (suppress_matched)
+    line = remove_line ();
+#endif
+
   linenum = get_first_line_in_buffer ();
 
   while (linenum++ < last_line_to_save)
@@ -778,6 +791,9 @@ process_regexp (struct control *p, uintmax_t repetition)
   if (!ignore)
     create_output_file ();
 
+  if (suppress_matched && current_line > 0)
+    line = remove_line ();
+
   /* If there is no offset for the regular expression, or
      it is positive, then it is not necessary to buffer the lines. */
 
@@ -1324,9 +1340,10 @@ main (int argc, char **argv)
   control_used = 0;
   suppress_count = false;
   remove_files = true;
+  suppress_matched = false;
   prefix = DEFAULT_PREFIX;
 
-  while ((optc = getopt_long (argc, argv, "f:b:kn:sqz", longopts, NULL)) != -1)
+  while ((optc = getopt_long (argc, argv, "f:b:kmn:sqz", longopts, NULL)) != -1)
     switch (optc)
       {
       case 'f':
@@ -1341,6 +1358,10 @@ main (int argc, char **argv)
         remove_files = false;
         break;
 
+      case 'm':
+        suppress_matched = true;
+        break;
+
       case 'n':
         if (xstrtoul (optarg, NULL, 10, &val, "") != LONGINT_OK
             || MIN (INT_MAX, SIZE_MAX) < val)
@@ -1465,6 +1486,9 @@ and output byte counts of each piece to standard output.\n\
   -k, --keep-files           do not remove output files on errors\n\
 "), stdout);
       fputs (_("\
+  -m, --suppress-matched     suppress the lines matching PATTERN\n\
+"), stdout);
+      fputs (_("\
   -n, --digits=DIGITS        use specified number of digits instead of 2\n\
   -s, --quiet, --silent      do not print counts of output file sizes\n\
   -z, --elide-empty-files    remove empty output files\n\
diff --git a/tests/local.mk b/tests/local.mk
index 607ddc4..fc53c75 100644
--- a/tests/local.mk
+++ b/tests/local.mk
@@ -260,6 +260,7 @@ all_tests =					\
   tests/misc/csplit.sh				\
   tests/misc/csplit-1000.sh			\
   tests/misc/csplit-heap.sh			\
+  tests/misc/csplit-suppress-matched.sh		\
   tests/misc/date-sec.sh			\
   tests/misc/dircolors.pl			\
   tests/misc/dirname.pl				\
diff --git a/tests/misc/csplit-suppress-matched.sh b/tests/misc/csplit-suppress-matched.sh
new file mode 100755
index 0000000..070284a
--- /dev/null
+++ b/tests/misc/csplit-suppress-matched.sh
@@ -0,0 +1,233 @@
+#!/bin/sh
+# Test csplit's --suppress-matched option
+
+# Copyright (C) 2013 Free Software Foundation, Inc.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
+print_ver_ csplit
+
+printf "%s\n" a a YY '' XX b b YY '' \
+              XX c YY '' XX d d d > in1 || framework_failure_
+
+# Expected output of test 1:
+#  the newline (matched line) appears in the output files (exp1/2/3)
+printf "a\na\nYY\n"          > test1_exp0 || framework_failure_
+printf "\nXX\nb\nb\nYY\n"    > test1_exp1 || framework_failure_
+printf "\nXX\nc\nYY\n"       > test1_exp2 || framework_failure_
+printf "\nXX\nd\nd\nd\n"     > test1_exp3 || framework_failure_
+
+# Expected output of test 2:
+#  the newline (matched line) does not appear in the output files
+printf "a\na\nYY\n"        > test2_exp0 || framework_failure_
+printf "XX\nb\nb\nYY\n"    > test2_exp1 || framework_failure_
+printf "XX\nc\nYY\n"       > test2_exp2 || framework_failure_
+printf "XX\nd\nd\nd\n"     > test2_exp3 || framework_failure_
+
+# Expected output of test 3:
+#  the XX (matched line + offset 1) does not appear in the output files.
+#  the newline appears in the files (before each split, at the end of the file)
+printf "a\na\nYY\n\n"    > test3_exp0 || framework_failure_
+printf "b\nb\nYY\n\n"    > test3_exp1 || framework_failure_
+printf "c\nYY\n\n"       > test3_exp2 || framework_failure_
+printf "d\nd\nd\n"       > test3_exp3 || framework_failure_
+
+# Expected output of test 4:
+#  the YY (matched line + offset of -1) does not appear in the output files
+#  the newline appears in the files (as the first line of the new split)
+printf "a\na\n"          > test4_exp0 || framework_failure_
+printf "\nXX\nb\nb\n"    > test4_exp1 || framework_failure_
+printf "\nXX\nc\n"       > test4_exp2 || framework_failure_
+printf "\nXX\nd\nd\nd\n" > test4_exp3 || framework_failure_
+
+seq 1 6 > in2 || framework_failure_
+
+# Expected output of test 5
+#  The matched lines (2/4/6) appear as the first line of new files.
+printf "1\n"             > test5_exp0 || framework_failure_
+printf "2\n3\n"          > test5_exp1 || framework_failure_
+printf "4\n5\n"          > test5_exp2 || framework_failure_
+printf "6\n"             > test5_exp3 || framework_failure_
+
+# Expected output of test 6
+#  The matched lines (2/4/6) are not present
+printf "1\n"          > test6_exp0 || framework_failure_
+printf "3\n"          > test6_exp1 || framework_failure_
+printf "5\n"          > test6_exp2 || framework_failure_
+
+
+# Test two consecutive matched lines
+printf "%s\n" a '' '' b > in3 || framework_failure_
+
+# Expected output of test 7:
+#  suppress-matched will cause the second group to be an empty file.
+#  (without --suppress-matched it should contain a single newline)
+printf "a\n" > test7_exp0 || framework_failure_
+printf ""    > test7_exp1 || framework_failure_
+printf "b\n" > test7_exp2 || framework_failure_
+
+# Expected output of test 8:
+#  suppress-matched + elide-empty-files
+#  should create just two files
+printf "a\n" > test8_exp0 || framework_failure_
+printf "b\n" > test8_exp1 || framework_failure_
+
+
+# A matched-line as the last line
+printf "%s\n" a '' b '' > in4 || framework_failure_
+
+# Expected output of test 9:
+#  suppress-matched should create just three files
+#  (as the last line which matched should be suppressed, but still start a
+#   new file)
+printf "a\n" > test9_exp0 || framework_failure_
+printf "b\n" > test9_exp1 || framework_failure_
+printf ""    > test9_exp2 || framework_failure_
+
+# Expected output of test 10:
+#  suppress-matched + elide-empty-files should create just two files
+printf "a\n" > test10_exp0 || framework_failure_
+printf "b\n" > test10_exp1 || framework_failure_
+
+
+
+##
+## Test 1:
+##    regexp baseline without --suppress-matched
+##
+csplit --prefix=t1_ in1 '/^$/' '{*}' > /dev/null || fail=1
+for i in 0 1 2 3 ;
+do
+  compare test1_exp$i t1_0$i || { fail=1 ; echo "test1_exp$i failed" 1>&2 ; }
+done
+
+##
+## Test 2:
+##    suppress-matched + regexp
+##
+csplit --prefix=t2_ --suppress-matched \
+       in1 '/^$/' '{*}' > /dev/null || fail=1
+for i in 0 1 2 3 ;
+do
+  compare test2_exp$i t2_0$i || { fail=1 ; echo "test2_exp$i failed" 1>&2 ; }
+done
+
+##
+## Test 3:
+##    suppress-matched + regexp + offset=1
+##
+csplit --prefix=t3_ --suppress-matched \
+       in1 '/^$/1' '{*}' > /dev/null || fail=1
+for i in 0 1 2 3 ;
+do
+  compare test3_exp$i t3_0$i || { fail=1 ; echo "test3_exp$i failed" 1>&2 ; }
+done
+
+##
+## Test 4:
+##    suppress-matched + regexp + offset=-1
+##
+csplit --prefix=t4_ --suppress-matched \
+       in1 '/^$/-1' '{*}' > /dev/null || fail=1
+for i in 0 1 2 3 ;
+do
+  compare test4_exp$i t4_0$i || { fail=1 ; echo "test4_exp$i failed" 1>&2 ; }
+done
+
+##
+## Test 5:
+##    INTEGER baseline without --suppress-matched
+##
+csplit --prefix=t5_ in2 2 4 6 > /dev/null || fail=1
+for i in 0 1 2 3 ;
+do
+  compare test5_exp$i t5_0$i || { fail=1 ; echo "test5_exp$i failed" 1>&2 ; }
+done
+
+##
+## TODO: FIX BUG when last line is the matched line
+##
+if false ; then
+  ##
+  ## Test 6:
+  ##    INTEGER with --suppress-matched
+  ##
+  csplit --suppress-matched --prefix=t6_ in2 2 4 6 > /dev/null || fail=1
+  for i in 0 1 2 ;
+  do
+    compare test6_exp$i t6_0$i || { fail=1 ; echo "test6_exp$i failed" 1>&2 ; }
+  done
+  #Extra check: the last file (containing only "6") should not be created at all
+  test -e t6_03 && { fail=1 ; echo "test6_exp3 - failed" 1>&2 ; }
+fi
+
+
+##
+## Test 7:
+##    suppress-matched + two consecutive groups
+##
+csplit --prefix=t7_ --suppress-matched \
+       in3 '/^$/' '{*}' > /dev/null || fail=1
+for i in 0 1 2 ;
+do
+  compare test7_exp$i t7_0$i || { fail=1 ; echo "test7_exp$i failed" 1>&2 ; }
+done
+
+##
+## Test 8:
+##    suppress-matched + two consecutive groups + elide-empty-files
+##
+csplit --prefix=t8_ --elide-empty-files --suppress-matched \
+       in3 '/^$/' '{*}' > /dev/null || fail=1
+for i in 0 1 ;
+do
+  compare test8_exp$i t8_0$i || { fail=1 ; echo "test8_exp$i failed" 1>&2 ; }
+done
+#Extra check: there should not be a third file
+test -e t8_02 && { fail=1 ; echo "test8_exp2 - failed" 1>&2 ; }
+
+
+##
+## Test 9:
+##    suppress-matched + matched-line as last line
+##
+csplit --prefix=t9_ --suppress-matched \
+       in4 '/^$/' '{*}' > /dev/null || fail=1
+for i in 0 1 2 ;
+do
+  compare test9_exp$i t9_0$i || { fail=1 ; echo "test9_exp$i failed" 1>&2 ; }
+done
+
+##
+## Test 10:
+##    suppress-matched + matched last line + elide-empty-files
+##
+csplit --prefix=t10_ --elide-empty-files --suppress-matched \
+       in4 '/^$/' '{*}' > /dev/null || fail=1
+for i in 0 1 ;
+do
+  compare test10_exp$i t10_0$i || { fail=1 ; echo "test10_exp$i failed" 1>&2 ; }
+done
+#Extra check: there should not be a third file
+test -e t10_02 && { fail=1 ; echo "test10_exp2 - failed" 1>&2 ; }
+
+
+
+
+
+
+
+
+Exit $fail
-- 
1.7.7.4

[PATCH] csplit: new option --suppress-matched

Reply via email to