Re: [PATCH]: uniq: add "--group" option

Assaf Gordon Thu, 21 Feb 2013 11:40:28 -0800

Assaf Gordon wrote, On 02/21/2013 11:37 AM:
> 
> You were planning on "--group" to mean explicitly "output all input lines, 
> and add group-markers for unique groups" (meaning -u/-d/-D and --group are 
> mutually exclusive).
>


Attached is a version that behaves as previously discussed.
"--group" can't be used with -c/-d/-D/-u.

Since it's a completely separate behavior, I found it easier to create a whole 
new code path in "check_file()" for the special case of grouping.

Comments are welcome,
 -gordon

>From 072ffee0f45a67465607cde3d984e6fd7e37a1af Mon Sep 17 00:00:00 2001
From: Assaf Gordon <[email protected]>
Date: Wed, 20 Feb 2013 13:31:22 -0500
Subject: [PATCH] uniq: add "--group" option

* src/uniq.c: implement "--group" option.
* tests/misc/uniq.pl: add tests.
---
 src/uniq.c         |  125 +++++++++++++++++++++++++++++++++++++++++++++++++---
 tests/misc/uniq.pl |   40 +++++++++++++++++
 2 files changed, 159 insertions(+), 6 deletions(-)

diff --git a/src/uniq.c b/src/uniq.c
index 5efdad7..598c62d 100644
--- a/src/uniq.c
+++ b/src/uniq.c
@@ -108,11 +108,47 @@ static enum delimit_method const delimit_method_map[] =
 /* Select whether/how to delimit groups of duplicate lines.  */
 static enum delimit_method delimit_groups;
 
+enum grouping_method
+{
+  /* No grouping, when "--group" isn't used */
+  GM_NONE,
+
+  /* Delimiter precedes all groups.  --group=prepend */
+  GM_PREPEND,
+
+  /* Delimiter follows all groups.   --group=append */
+  GM_APPEND,
+
+  /* Delimiter between groups.    --group[=separate] */
+  GM_SEPARATE,
+
+  /* Delimiter before and after each group. --group=both */
+  GM_BOTH
+};
+
+static char const *const grouping_method_string[] =
+{
+  "prepend", "append", "separate", "both", NULL
+};
+
+static enum grouping_method const grouping_method_map[] =
+{
+  GM_PREPEND, GM_APPEND, GM_SEPARATE, GM_BOTH
+};
+
+static enum grouping_method grouping = GM_NONE;
+
+enum
+{
+  GROUP_OPTION = CHAR_MAX + 1
+};
+
 static struct option const longopts[] =
 {
   {"count", no_argument, NULL, 'c'},
   {"repeated", no_argument, NULL, 'd'},
   {"all-repeated", optional_argument, NULL, 'D'},
+  {"group", optional_argument, NULL, GROUP_OPTION},
   {"ignore-case", no_argument, NULL, 'i'},
   {"unique", no_argument, NULL, 'u'},
   {"skip-fields", required_argument, NULL, 'f'},
@@ -159,6 +195,11 @@ With no options, matching lines are merged to the first occurrence.\n\
   -z, --zero-terminated  end lines with 0 byte, not newline\n\
 "), stdout);
      fputs (_("\
+      --group=[method]  separate each unique group (whether duplicated or not)\n\
+                        with an empty line.\n\
+                        method={separate(default),prepend,append,both)\n\
+"), stdout);
+     fputs (_("\
   -w, --check-chars=N   compare no more than N characters in lines\n\
 "), stdout);
      fputs (HELP_OPTION_DESCRIPTION, stdout);
@@ -293,13 +334,57 @@ check_file (const char *infile, const char *outfile, char delimiter)
   initbuffer (prevline);
 
   /* The duplication in the following 'if' and 'else' blocks is an
-     optimization to distinguish the common case (in which none of
-     the following options has been specified: --count, -repeated,
-     --all-repeated, --unique) from the others.  In the common case,
-     this optimization lets uniq output each different line right away,
-     without waiting to see if the next one is different.  */
+     optimization to distinguish several cases:
 
-  if (output_unique && output_first_repeated && countmode == count_none)
+     1. grouping (--group=X) - all input lines are printed.
+        checking for unique/duplicated lines is used only for printing
+        group separators.
+
+     2. The common case -
+        In which none of the following options has been specified:
+          --count, --repeated,  --all-repeated, --unique
+        In the common case, this optimization lets uniq output each different
+        line right away, without waiting to see if the next one is different.
+
+     3. All other cases.
+  */
+  if (grouping != GM_NONE)
+    {
+      char *prevfield IF_LINT ( = NULL);
+      size_t prevlen IF_LINT ( = 0);
+      bool first_group_printed = false;
+
+      while (!feof (stdin))
+        {
+          char *thisfield;
+          size_t thislen;
+          bool new_group;
+          if (readlinebuffer_delim (thisline, stdin, delimiter) == 0)
+            break;
+          thisfield = find_field (thisline);
+          thislen = thisline->length - 1 - (thisfield - thisline->buffer);
+
+          new_group = (prevline->length == 0
+                       || different (thisfield, prevfield, thislen, prevlen));
+
+          if (new_group
+              && ( (grouping == GM_PREPEND) || (grouping == GM_BOTH)
+                   || ( first_group_printed
+                        &&
+                        ( grouping == GM_APPEND || grouping == GM_SEPARATE ))))
+            putchar (delimiter);
+
+          fwrite (thisline->buffer, sizeof (char), thisline->length, stdout);
+          SWAP_LINES (prevline, thisline);
+          prevfield = thisfield;
+          prevlen = thislen;
+          first_group_printed = true;
+        }
+      if ( (grouping == GM_BOTH || grouping == GM_APPEND )
+           && first_group_printed)
+        putchar (delimiter);
+    }
+  else if (output_unique && output_first_repeated && countmode == count_none)
     {
       char *prevfield IF_LINT ( = NULL);
       size_t prevlen IF_LINT ( = 0);
@@ -415,6 +500,7 @@ main (int argc, char **argv)
   int nfiles = 0;
   char const *file[2];
   char delimiter = '\n';	/* change with --zero-terminated, -z */
+  bool output_option_used = false;   /* if true, one of -u/-d/-D/-c was used */
 
   file[0] = file[1] = "-";
   initialize_main (&argc, &argv);
@@ -498,10 +584,12 @@ main (int argc, char **argv)
 
         case 'c':
           countmode = count_occurrences;
+          output_option_used = true;
           break;
 
         case 'd':
           output_unique = false;
+          output_option_used = true;
           break;
 
         case 'D':
@@ -513,6 +601,16 @@ main (int argc, char **argv)
             delimit_groups = XARGMATCH ("--all-repeated", optarg,
                                         delimit_method_string,
                                         delimit_method_map);
+          output_option_used = true;
+          break;
+
+        case GROUP_OPTION:
+          if (optarg == NULL)
+            grouping = GM_SEPARATE;
+          else
+            grouping = XARGMATCH ("--group", optarg,
+                                  grouping_method_string,
+                                  grouping_method_map);
           break;
 
         case 'f':
@@ -532,6 +630,7 @@ main (int argc, char **argv)
 
         case 'u':
           output_first_repeated = false;
+          output_option_used = true;
           break;
 
         case 'w':
@@ -552,6 +651,20 @@ main (int argc, char **argv)
         }
     }
 
+  if (grouping != GM_NONE && output_option_used)
+    {
+      /* TODO: improve wording? */
+      error (0, 0, _("grouping can not be combined with -c/-d/-D/-u"));
+      usage (EXIT_FAILURE);
+    }
+
+  if (grouping != GM_NONE && countmode != count_none)
+    {
+      error (0, 0,
+           _("grouping and printing repeat counts is meaningless"));
+      usage (EXIT_FAILURE);
+    }
+
   if (countmode == count_occurrences && output_later_repeated)
     {
       error (0, 0,
diff --git a/tests/misc/uniq.pl b/tests/misc/uniq.pl
index 140a49b..8933a5c 100755
--- a/tests/misc/uniq.pl
+++ b/tests/misc/uniq.pl
@@ -199,6 +199,46 @@ my @Tests =
  # Check that --zero-terminated is synonymous with -z.
  ['123', '--zero-terminated', {IN=>"a\na\nb"}, {OUT=>"a\na\nb\0"}],
  ['124', '--zero-terminated', {IN=>"a\0a\0b"}, {OUT=>"a\0b\0"}],
+
+ # Check grouping
+ ['125', '--group=prepend', {IN=>"a\na\nb\n"}, {OUT=>"\na\na\n\nb\n"}],
+ ['126', '--group=append',  {IN=>"a\na\nb\n"}, {OUT=>"a\na\n\nb\n\n"}],
+ ['127', '--group=separate',{IN=>"a\na\nb\n"}, {OUT=>"a\na\n\nb\n"}],
+ # no explicit grouping = separate
+ ['128', '--group',         {IN=>"a\na\nb\n"}, {OUT=>"a\na\n\nb\n"}],
+ ['129', '--group=both',    {IN=>"a\na\nb\n"}, {OUT=>"\na\na\n\nb\n\n"}],
+ # Grouping in the special case of a single group
+ ['130', '--group=prepend', {IN=>"a\na\n"}, {OUT=>"\na\na\n"}],
+ ['131', '--group=append',  {IN=>"a\na\n"}, {OUT=>"a\na\n\n"}],
+ ['132', '--group=separate',{IN=>"a\na\n"}, {OUT=>"a\na\n"}],
+ ['133', '--group',         {IN=>"a\na\n"}, {OUT=>"a\na\n"}],
+ # Grouping with empty input - should never print anything
+ ['134', '--group=prepend',  {IN=>""}, {OUT=>""}],
+ ['135', '--group=append',   {IN=>""}, {OUT=>""}],
+ ['136', '--group=separate', {IN=>""}, {OUT=>""}],
+ ['137', '--group=both',     {IN=>""}, {OUT=>""}],
+ # Grouping with other options - must fail
+ ['138', '--group -c',       {IN=>""}, {OUT=>""}, {EXIT=>1},
+  {ERR=>"$prog: grouping can not be combined with -c/-d/-D/-u\n" .
+        "Try 'uniq --help' for more information.\n"}],
+ ['139', '--group -d',       {IN=>""}, {OUT=>""}, {EXIT=>1},
+  {ERR=>"$prog: grouping can not be combined with -c/-d/-D/-u\n" .
+        "Try 'uniq --help' for more information.\n"}],
+ ['140', '--group -u',       {IN=>""}, {OUT=>""}, {EXIT=>1},
+  {ERR=>"$prog: grouping can not be combined with -c/-d/-D/-u\n" .
+        "Try 'uniq --help' for more information.\n"}],
+ ['141', '--group -D',       {IN=>""}, {OUT=>""}, {EXIT=>1},
+  {ERR=>"$prog: grouping can not be combined with -c/-d/-D/-u\n" .
+        "Try 'uniq --help' for more information.\n"}],
+ # Grouping with badoption
+ ['142', '--group=badoption',{IN=>""}, {OUT=>""}, {EXIT=>1},
+  {ERR=>"$prog: invalid argument 'badoption' for '--group'\n" .
+        "Valid arguments are:\n" .
+        "  - 'prepend'\n" .
+        "  - 'append'\n" .
+        "  - 'separate'\n" .
+        "  - 'both'\n" .
+        "Try '$prog --help' for more information.\n"}],
 );
 
 # Set _POSIX2_VERSION=199209 in the environment of each obs-plus* test.
-- 
1.7.7.4

Re: [PATCH]: uniq: add "--group" option

Reply via email to