[PATCH]: uniq: add "--group" option

Assaf Gordon Wed, 20 Feb 2013 10:44:33 -0800

Hello,

Attached is a proposed patch adding a "--group" option to uniq, as discussed here:
  http://lists.gnu.org/archive/html/coreutils/2011-03/msg00000.html
  http://lists.gnu.org/archive/html/coreutils/2012-03/msg00052.html


The patch adds two options:
      --group[=method]  separate each unique line (whether duplicated or not)
                        with a marker.
                        method={none,separate(default),prepend,append,both}
      --group-separator=SEP   with --group, separate groups using SEP
                        (default: empty line)

And it behaves "as expected":
===
$ printf "a\na\na\nb\nc\nc\n" | ./src/uniq --group-sep="--" --group=separate
a
--
b
--
c

$ printf "a\na\na\nb\nc\nc\n" | ./src/uniq --group-sep="--" --group=both
--
a
--
b
--
c
--

$ printf "a\na\na\nb\nc\nc\n" | ./src/uniq --group-sep="--" --group=prepend
--
a
--
b
--
c

$ printf "a\na\na\nb\nc\nc\n" | ./src/uniq -D --group-sep="--" --group=both
--
a
a
a
--
c
c
--

===

The added tests check all sorts of combinations.

If this is the right direction, I'll send an updated patch (with 
NEWS/docs/etc.).

-gordon

From ece4bcc78d23050da1572ddec29ed81a806cbf4b Mon Sep 17 00:00:00 2001
From: Assaf Gordon <[email protected]>
Date: Wed, 20 Feb 2013 13:31:22 -0500
Subject: [PATCH] uniq: add "--group" option

* src/uniq.c: implement "--group" options.
* tests/misc/uniq.pl: add tests.
---
 src/uniq.c         |   96 +++++++++++++++++++++++++++++++++++++++++++++++++--
 tests/misc/uniq.pl |   33 ++++++++++++++++++
 2 files changed, 125 insertions(+), 4 deletions(-)

diff --git a/src/uniq.c b/src/uniq.c
index 5efdad7..c41e25d 100644
--- a/src/uniq.c
+++ b/src/uniq.c
@@ -83,6 +83,10 @@ static bool output_later_repeated;
 /* If true, ignore case when comparing.  */
 static bool ignore_case;
 
+/* If true, writeline() has already output at least one line.
+   Used with grouping.  */
+static bool first_group_printed = false;
+
 enum delimit_method
 {
   /* No delimiters output.  --all-repeated[=none] */
@@ -108,11 +112,53 @@ static enum delimit_method const delimit_method_map[] =
 /* Select whether/how to delimit groups of duplicate lines.  */
 static enum delimit_method delimit_groups;
 
+enum grouping_method
+{
+  /* No grouping.     --group=none */
+  GM_NONE,
+
+  /* Delimiter precedes all groups.  --group=prepend */
+  GM_PREPEND,
+
+  /* Delimiter follows all groups.   --group=append */
+  GM_APPEND,
+
+  /* Delimiter between groups.    --group[=separate] */
+  GM_SEPARATE,
+
+  /* Delimiter before and after each group. --group=both */
+  GM_BOTH
+};
+
+static char const *const grouping_method_string[] =
+{
+  "none", "prepend", "append", "separate", "both", NULL
+};
+
+static enum grouping_method const grouping_method_map[] =
+{
+  GM_NONE, GM_PREPEND, GM_APPEND, GM_SEPARATE, GM_BOTH
+};
+
+static enum grouping_method grouping;
+
+/* Default group separator - empty line.
+   The end-of-line delimiter (newline or NUL) will be appended.  */
+static const char* grouping_delimiter = "";
+
+enum
+{
+  GROUP_OPTION = CHAR_MAX + 1,
+  GROUP_SEPARATOR_OPTION,
+};
+
 static struct option const longopts[] =
 {
   {"count", no_argument, NULL, 'c'},
   {"repeated", no_argument, NULL, 'd'},
   {"all-repeated", optional_argument, NULL, 'D'},
+  {"group", optional_argument, NULL, GROUP_OPTION},
+  {"group-separator", required_argument, NULL, GROUP_SEPARATOR_OPTION},
   {"ignore-case", no_argument, NULL, 'i'},
   {"unique", no_argument, NULL, 'u'},
   {"skip-fields", required_argument, NULL, 'f'},
@@ -159,6 +205,13 @@ With no options, matching lines are merged to the first occurrence.\n\
   -z, --zero-terminated  end lines with 0 byte, not newline\n\
 "), stdout);
      fputs (_("\
+      --group=[method]  separate each unique line (whether duplicated or not)\n\
+                        with a marker.\n\
+                        method={none,separate(default),prepend,append,both)\n\
+      --group-separator=SEP   with --group, separates group using SEP\n\
+                        (default: empty line)\n\
+"), stdout);
+     fputs (_("\
   -w, --check-chars=N   compare no more than N characters in lines\n\
 "), stdout);
      fputs (HELP_OPTION_DESCRIPTION, stdout);
@@ -257,17 +310,32 @@ different (char *old, char *new, size_t oldlen, size_t newlen)
 
 static void
 writeline (struct linebuffer const *line,
-           bool match, uintmax_t linecount)
+           bool match, uintmax_t linecount, char delimiter)
 {
   if (! (linecount == 0 ? output_unique
          : !match ? output_first_repeated
          : output_later_repeated))
     return;
 
+  if ( ( !output_later_repeated || (match && linecount==1) )
+      && (grouping == GM_PREPEND || grouping == GM_BOTH
+          || ( grouping == GM_SEPARATE && first_group_printed )))
+    {
+      fputs (grouping_delimiter, stdout);
+      putchar (delimiter);
+    }
+  first_group_printed = true;
+
   if (countmode == count_occurrences)
     printf ("%7" PRIuMAX " ", linecount + 1);
 
   fwrite (line->buffer, sizeof (char), line->length, stdout);
+
+  if (!match && grouping == GM_APPEND)
+    {
+      fputs (grouping_delimiter, stdout);
+      putchar (delimiter);
+    }
 }
 
 /* Process input file INFILE with output to OUTFILE.
@@ -299,7 +367,8 @@ check_file (const char *infile, const char *outfile, char delimiter)
      this optimization lets uniq output each different line right away,
      without waiting to see if the next one is different.  */
 
-  if (output_unique && output_first_repeated && countmode == count_none)
+  if (output_unique && output_first_repeated && countmode == count_none
+      && grouping == GM_NONE)
     {
       char *prevfield IF_LINT ( = NULL);
       size_t prevlen IF_LINT ( = 0);
@@ -377,7 +446,7 @@ check_file (const char *infile, const char *outfile, char delimiter)
 
           if (!match || output_later_repeated)
             {
-              writeline (prevline, match, match_count);
+              writeline (prevline, match, match_count, delimiter);
               SWAP_LINES (prevline, thisline);
               prevfield = thisfield;
               prevlen = thislen;
@@ -386,13 +455,19 @@ check_file (const char *infile, const char *outfile, char delimiter)
             }
         }
 
-      writeline (prevline, false, match_count);
+      writeline (prevline, false, match_count, delimiter);
     }
 
  closefiles:
   if (ferror (stdin) || fclose (stdin) != 0)
     error (EXIT_FAILURE, 0, _("error reading %s"), infile);
 
+  if (grouping == GM_BOTH && first_group_printed)
+    {
+      fputs (grouping_delimiter, stdout);
+      putchar (delimiter);
+    }
+
   /* stdout is handled via the atexit-invoked close_stdout function.  */
 
   free (lb1.buffer);
@@ -515,6 +590,19 @@ main (int argc, char **argv)
                                         delimit_method_map);
           break;
 
+        case GROUP_OPTION:
+          if (optarg == NULL)
+            grouping = GM_SEPARATE;
+          else
+            grouping = XARGMATCH ("--group", optarg,
+                                  grouping_method_string,
+                                  grouping_method_map);
+          break;
+
+        case GROUP_SEPARATOR_OPTION:
+          grouping_delimiter = optarg;
+          break;
+
         case 'f':
           skip_field_option_type = SFO_NEW;
           skip_fields = size_opt (optarg,
diff --git a/tests/misc/uniq.pl b/tests/misc/uniq.pl
index 140a49b..a6ce93d 100755
--- a/tests/misc/uniq.pl
+++ b/tests/misc/uniq.pl
@@ -199,6 +199,39 @@ my @Tests =
  # Check that --zero-terminated is synonymous with -z.
  ['123', '--zero-terminated', {IN=>"a\na\nb"}, {OUT=>"a\na\nb\0"}],
  ['124', '--zero-terminated', {IN=>"a\0a\0b"}, {OUT=>"a\0b\0"}],
+
+ # Check grouping
+ ['125', '--group=prepend', {IN=>"a\na\nb\n"}, {OUT=>"\na\n\nb\n"}],
+ ['126', '--group=append',  {IN=>"a\na\nb\n"}, {OUT=>"a\n\nb\n\n"}],
+ ['127', '--group=separate',{IN=>"a\na\nb\n"}, {OUT=>"a\n\nb\n"}],
+ # --group with no explicit method defaults to "separate"
+ ['128', '--group',         {IN=>"a\na\nb\n"}, {OUT=>"a\n\nb\n"}],
+ ['129', '--group=none',    {IN=>"a\na\nb\n"}, {OUT=>"a\nb\n"}],
+ ['130', '--group=both',    {IN=>"a\na\nb\n"}, {OUT=>"\na\n\nb\n\n"}],
+ # Grouping in the special case of a single group
+ ['131', '--group=prepend', {IN=>"a\na\n"}, {OUT=>"\na\n"}],
+ ['132', '--group=append',  {IN=>"a\na\n"}, {OUT=>"a\n\n"}],
+ ['133', '--group=separate',{IN=>"a\na\n"}, {OUT=>"a\n"}],
+ ['134', '--group',         {IN=>"a\na\n"}, {OUT=>"a\n"}],
+ ['135', '--group=both',    {IN=>"a\na\n"}, {OUT=>"\na\n\n"}],
+ ['136', '--group=none',    {IN=>"a\na\n"}, {OUT=>"a\n"}],
+ # Grouping with "-D" (=show duplicated lines)
+ ['137', '--group=prepend -D' , {IN=>"a\na\na\n"}, {OUT=>"\na\na\na\n"}],
+ ['138', '--group=append -D',   {IN=>"a\na\na\n"}, {OUT=>"a\na\na\n\n"}],
+ ['139', '--group=separate -D',  {IN=>"a\na\na\n"}, {OUT=>"a\na\na\n"}],
+ ['140', '--group=separate -D',  {IN=>"a\na\na\nb\nb\nb\n"},
+         {OUT=>"a\na\na\n\nb\nb\nb\n"}],
+ ['141', '--group=both -D',  {IN=>"a\na\na\nb\nb\nb\n"},
+         {OUT=>"\na\na\na\n\nb\nb\nb\n\n"}],
+ # Grouping with custom group delimiter
+ ['142', '--group=both --group-sep="-=-"', {IN=>"a\na\nb\n"},
+         {OUT=>"-=-\na\n-=-\nb\n-=-\n"}],
+ # Grouping with empty input - should never print anything
+ ['143', '--group=prepend',  {IN=>""}, {OUT=>""}],
+ ['144', '--group=append',   {IN=>""}, {OUT=>""}],
+ ['145', '--group=separate', {IN=>""}, {OUT=>""}],
+ ['146', '--group=none',     {IN=>""}, {OUT=>""}],
+ ['147', '--group=both',     {IN=>""}, {OUT=>""}],
 );
 
 # Set _POSIX2_VERSION=199209 in the environment of each obs-plus* test.
-- 
1.7.7.4

[PATCH]: uniq: add "--group" option

Reply via email to