uniq - check specific fields

Assaf Gordon Thu, 07 Feb 2013 09:13:23 -0800

Hello,

Attached is a proof-of-concept patch to add "--check-fields=N" to uniq, 
allowing uniq'ing by specific fields.
(Trying a different approach at promoting csplit-by-field [1] :) ).


It works just like 'check-chars' but on fields, and if not used, it does not 
affect the program flow.
===
    # input file; every whole line is unique
    $ cat input.txt 
    A 1 z
    A 1 y
    A 2 x
    B 2 w
    B 3 w
    C 3 w
    C 4 w
    
    # regular uniq
    $ uniq -c input.txt 
          1 A 1 z
          1 A 1 y
          1 A 2 x
          1 B 2 w
          1 B 3 w
          1 C 3 w
          1 C 4 w
          
    # Stop after 1 field
    $ uniq -c --check-fields 1 input.txt 
          3 A 1 z
          2 B 2 w
          2 C 3 w
    
    # Stop after 2 fields
    $ uniq -c --check-fields 2 input.txt 
          2 A 1 z
          1 A 2 x
          1 B 2 w
          1 B 3 w
          1 C 3 w
          1 C 4 w
    
    # Skip the first field and check 1 field (effectively, uniq on field 2)
    $ uniq -c  --skip-fields 1 --check-fields 1 input.txt 
          2 A 1 z
          2 A 2 x
          2 B 3 w
          1 C 4 w
    
    # "--field" is a convenience shortcut for --skip-fields plus --check-fields
    $ uniq -c --field 2 input.txt 
          2 A 1 z
          2 A 2 x
          2 B 3 w
          1 C 4 w
    $ uniq -c --field 3 input.txt 
          1 A 1 z
          1 A 1 y
          1 A 2 x
          4 B 2 w
===

What do you think?

 -gordon

[1] http://lists.gnu.org/archive/html/coreutils/2013-02/msg00015.html

From 08ee89a89d6912c5872a1785b9079d943ad71623 Mon Sep 17 00:00:00 2001
From: Assaf Gordon <assafgor...@gmail.com>
Date: Thu, 7 Feb 2013 11:46:22 -0500
Subject: [PATCH] uniq: support uniq-by-field

src/uniq.c: add --field and --check-fields=N support
---
 src/uniq.c |   68 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 files changed, 67 insertions(+), 1 deletions(-)

diff --git a/src/uniq.c b/src/uniq.c
index 5efdad7..b7c3dc8 100644
--- a/src/uniq.c
+++ b/src/uniq.c
@@ -63,6 +63,9 @@ static size_t skip_chars;
 /* Number of chars to compare. */
 static size_t check_chars;
 
+/* Number of fields to compare */
+static size_t check_fields;
+
 enum countmode
 {
   count_occurrences,		/* -c Print count before output lines. */
@@ -108,6 +111,13 @@ static enum delimit_method const delimit_method_map[] =
 /* Select whether/how to delimit groups of duplicate lines.  */
 static enum delimit_method delimit_groups;
 
+/* For long options that have no equivalent short option, use a
+   non-character as a pseudo short option, starting with CHAR_MAX + 1.  */
+enum
+{
+  UNIQ_FIELD = CHAR_MAX + 1,   /* option code for the "field" long option */
+};
+
 static struct option const longopts[] =
 {
   {"count", no_argument, NULL, 'c'},
@@ -118,6 +128,8 @@ static struct option const longopts[] =
   {"skip-fields", required_argument, NULL, 'f'},
   {"skip-chars", required_argument, NULL, 's'},
   {"check-chars", required_argument, NULL, 'w'},
+  {"check-fields", required_argument, NULL, 'y'},
+  {"field", required_argument, NULL, UNIQ_FIELD},
   {"zero-terminated", no_argument, NULL, 'z'},
   {GETOPT_HELP_OPTION_DECL},
   {GETOPT_VERSION_OPTION_DECL},
@@ -153,6 +165,8 @@ With no options, matching lines are merged to the first occurrence.\n\
                         delimit-method={none(default),prepend,separate}\n\
                         Delimiting is done with blank lines\n\
   -f, --skip-fields=N   avoid comparing the first N fields\n\
+      --field=N         check only field N.\n\
+                        equivalent to '-f (N-1) -y 1'\n\
   -i, --ignore-case     ignore differences in case when comparing\n\
   -s, --skip-chars=N    avoid comparing the first N characters\n\
   -u, --unique          only print unique lines\n\
@@ -160,6 +174,7 @@ With no options, matching lines are merged to the first occurrence.\n\
 "), stdout);
      fputs (_("\
   -w, --check-chars=N   compare no more than N characters in lines\n\
+  -y, --check-fields=N  compare no more than N fields in lines\n\
 "), stdout);
      fputs (HELP_OPTION_DESCRIPTION, stdout);
      fputs (VERSION_OPTION_DESCRIPTION, stdout);
@@ -225,6 +240,34 @@ find_field (struct linebuffer const *line)
   return line->buffer + i;
 }
 
+/* Return the length of the prefix of STR (at most MAXLEN bytes) that spans
+   the first 'check_fields' fields, where a field is optional blanks followed
+   by non-blanks; return MAXLEN if STR holds fewer fields than that.  */
+static size_t _GL_ATTRIBUTE_PURE
+check_fields_length (const char* str, size_t maxlen)
+{
+  size_t count;
+  size_t i = 0;
+
+  /* Each pass consumes one field: skip any leading blanks, then advance
+     over the non-blank bytes that follow.  Every step is bounded by
+     MAXLEN, so STR is never read past its stated length.  */
+
+  for (count = 0; count < check_fields && i < maxlen; count++)
+    {
+      while (i < maxlen && isblank (to_uchar (str[i])))
+        i++;
+      while (i < maxlen && !isblank (to_uchar (str[i])))
+        i++;
+    }
+
+  /* I is now the offset just past the last field consumed, or MAXLEN if
+     the input ran out first.  Callers in check_file store it as the
+     comparison length of the line.  */
+
+  return i;
+}
+
 /* Return false if two strings OLD and NEW match, true if not.
    OLD and NEW point not to the beginnings of the lines
    but rather to the beginnings of the fields to compare.
@@ -312,6 +355,8 @@ check_file (const char *infile, const char *outfile, char delimiter)
             break;
           thisfield = find_field (thisline);
           thislen = thisline->length - 1 - (thisfield - thisline->buffer);
+          if (check_fields)
+            thislen = check_fields_length (thisfield, thislen);
           if (prevline->length == 0
               || different (thisfield, prevfield, thislen, prevlen))
             {
@@ -335,6 +380,8 @@ check_file (const char *infile, const char *outfile, char delimiter)
         goto closefiles;
       prevfield = find_field (prevline);
       prevlen = prevline->length - 1 - (prevfield - prevline->buffer);
+      if (check_fields)
+        prevlen = check_fields_length (prevfield, prevlen);
 
       while (!feof (stdin))
         {
@@ -349,6 +396,8 @@ check_file (const char *infile, const char *outfile, char delimiter)
             }
           thisfield = find_field (thisline);
           thislen = thisline->length - 1 - (thisfield - thisline->buffer);
+          if (check_fields)
+            thislen = check_fields_length (thisfield, thislen);
           match = !different (thisfield, prevfield, thislen, prevlen);
           match_count += match;
 
@@ -429,6 +478,7 @@ main (int argc, char **argv)
   skip_chars = 0;
   skip_fields = 0;
   check_chars = SIZE_MAX;
+  check_fields = 0;
   output_unique = output_first_repeated = true;
   output_later_repeated = false;
   countmode = count_none;
@@ -443,7 +493,7 @@ main (int argc, char **argv)
       if (optc == -1
           || (posixly_correct && nfiles != 0)
           || ((optc = getopt_long (argc, argv,
-                                   "-0123456789Dcdf:is:uw:z", longopts, NULL))
+                                   "-0123456789Dcdf:is:uw:y:z", longopts, NULL))
               == -1))
         {
           if (argc <= optind)
@@ -539,6 +589,22 @@ main (int argc, char **argv)
                                   N_("invalid number of bytes to compare"));
           break;
 
+        case 'y':
+          check_fields = size_opt (optarg,
+                                  N_("invalid number of fields to compare"));
+          if (check_fields==0)
+            error (EXIT_FAILURE, 0, N_("invalid number of fields to compare"));
+          break;
+
+        case UNIQ_FIELD:
+          skip_fields = size_opt (optarg,
+                                  N_("invalid number of field to compare"));
+          if (skip_fields==0)
+            error (EXIT_FAILURE, 0, N_("invalid number of field to compare"));
+          --skip_fields; /* users specify 1-based values */
+          check_fields = 1;
+          break;
+
         case 'z':
           delimiter = '\0';
           break;
-- 
1.7.7.4

uniq - check specific fields

Reply via email to