Hello,

On 2022-01-06 7:35 a.m., Pádraig Brady wrote:
Thanks for taking the time to consolidate options/functionality
across different implementations.  This is important for users.
Some notes below...

On 05/01/2022 16:23, Rob Landley wrote:
Around 5 years ago toybox added the -D, -F, and -O options to cut:

     -D  Don't sort/collate selections or match -fF lines without delimiter
     -F  Select fields separated by DELIM regex
     -O  Output delimiter (default one space for -F, input delim for -f)


As I see it, the main functionalities added here are:
   - reordering of selected fields
   - adjusted suppression of lines without matching fields
   - regex delimiter support

I see regex support as less important, but still useful.



Attached is a suggestion for initial implementation of "cut -FDO".
It's split into smaller steps to ease review.

The main issue is that the current "cut_fields" and "cut_bytes" are
highly optimized for speed, so I left them as-is and created a secondary
set of 'cut' functions - slower but with additional options.

If this is acceptable, I'll go on to clean up the patches, add more
tests and write documentation.

There are likely some edge-cases regarding regex matching that need to be decided upon (e.g. BRE or ERE, what about BOL/EOL anchors, groups, etc.).

Comments and feedback are very welcome,

regards,
 - assaf

>From dbfdef9a720c8ea9ed1a90a4e4c66aa7e0ed3e1f Mon Sep 17 00:00:00 2001
From: Assaf Gordon <assafgor...@gmail.com>
Date: Wed, 5 Jan 2022 13:03:39 -0700
Subject: [PATCH 1/9] cut: set-fields: add no-sort options

---
 src/set-fields.c | 27 +++++++++++++++------------
 src/set-fields.h |  4 +++-
 2 files changed, 18 insertions(+), 13 deletions(-)

diff --git a/src/set-fields.c b/src/set-fields.c
index e3cce30d9..5e4ee6715 100644
--- a/src/set-fields.c
+++ b/src/set-fields.c
@@ -279,22 +279,25 @@ set_fields (char const *fieldstr, unsigned int options)
                  ? _("missing list of byte/character positions")
                  : _("missing list of fields"));
 
-  qsort (frp, n_frp, sizeof (frp[0]), compare_ranges);
-
-  /* Merge range pairs (e.g. `2-5,3-4' becomes `2-5'). */
-  for (size_t i = 0; i < n_frp; ++i)
+  if (!(options & SETFLD_NO_SORT))
     {
-      for (size_t j = i + 1; j < n_frp; ++j)
+      qsort (frp, n_frp, sizeof (frp[0]), compare_ranges);
+
+      /* Merge range pairs (e.g. `2-5,3-4' becomes `2-5'). */
+      for (size_t i = 0; i < n_frp; ++i)
         {
-          if (frp[j].lo <= frp[i].hi)
+          for (size_t j = i + 1; j < n_frp; ++j)
             {
-              frp[i].hi = MAX (frp[j].hi, frp[i].hi);
-              memmove (frp + j, frp + j + 1, (n_frp - j - 1) * sizeof *frp);
-              n_frp--;
-              j--;
+              if (frp[j].lo <= frp[i].hi)
+                {
+                  frp[i].hi = MAX (frp[j].hi, frp[i].hi);
+                  memmove (frp + j, frp + j + 1, (n_frp - j - 1) * sizeof *frp);
+                  n_frp--;
+                  j--;
+                }
+              else
+                break;
             }
-          else
-            break;
         }
     }
 
diff --git a/src/set-fields.h b/src/set-fields.h
index 7bc9b3afe..9127d9957 100644
--- a/src/set-fields.h
+++ b/src/set-fields.h
@@ -34,8 +34,10 @@ enum
 {
   SETFLD_ALLOW_DASH = 0x01,     /* allow single dash meaning 'all fields' */
   SETFLD_COMPLEMENT = 0x02,     /* complement the field list */
-  SETFLD_ERRMSG_USE_POS = 0x04  /* when reporting errors, say 'position' instead
+  SETFLD_ERRMSG_USE_POS = 0x04, /* when reporting errors, say 'position' instead
                                    of 'field' (used with cut -b/-c) */
+  SETFLD_NO_SORT    = 0x08      /* Do not sort the fields; keep duplicated
+                                   and overlapped fields */
 };
 
 /* allocates and initializes the FRP array and N_FRP count */
-- 
2.20.1

>From d5d58eeb0bf5a399b2d65e174c72d0f8c11b2c01 Mon Sep 17 00:00:00 2001
From: Assaf Gordon <assafgor...@gmail.com>
Date: Wed, 5 Jan 2022 13:04:08 -0700
Subject: [PATCH 2/9] cut: initial -D implementation, currently only with "-f"

---
 src/cut.c | 161 ++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 156 insertions(+), 5 deletions(-)

diff --git a/src/cut.c b/src/cut.c
index 5143c8bd9..84caad091 100644
--- a/src/cut.c
+++ b/src/cut.c
@@ -20,7 +20,9 @@
 /* POSIX changes, bug fixes, long-named options, and cleanup
    by David MacKenzie <d...@gnu.ai.mit.edu>.
 
-   Rewrite cut_fields and cut_bytes -- Jim Meyering.  */
+   Rewrite cut_fields and cut_bytes -- Jim Meyering.
+
+   Match toybox's -D,-F,-O options -- Assaf Gordon. */
 
 #include <config.h>
 
@@ -43,7 +45,8 @@
 #define AUTHORS \
   proper_name ("David M. Ihnat"), \
   proper_name ("David MacKenzie"), \
-  proper_name ("Jim Meyering")
+  proper_name ("Jim Meyering"), \
+  proper_name ("Assaf Gordon")
 
 #define FATAL_ERROR(Message)						\
   do									\
@@ -113,6 +116,15 @@ static char *output_delimiter_string;
 /* True if we have ever read standard input. */
 static bool have_read_stdin;
 
+/* If true, use different (but less optimized) code.
+   Used with -F and/or -D.  */
+static bool adv_mode;
+
+/* True if -D is used: allow duplicated output bytes/chars/fields
+   and do not sort the output list */
+static bool allow_duplicates;
+
+
 /* For long options that have no equivalent short option, use a
    non-character as a pseudo short option, starting with CHAR_MAX + 1.  */
 enum
@@ -127,6 +139,7 @@ static struct option const longopts[] =
   {"characters", required_argument, NULL, 'c'},
   {"fields", required_argument, NULL, 'f'},
   {"delimiter", required_argument, NULL, 'd'},
+  {"allow-duplicates", required_argument, NULL, 'D'},
   {"only-delimited", no_argument, NULL, 's'},
   {"output-delimiter", required_argument, NULL, OUTPUT_DELIMITER_OPTION},
   {"complement", no_argument, NULL, COMPLEMENT_OPTION},
@@ -158,6 +171,10 @@ Print selected parts of lines from each FILE to standard output.\n\
   -b, --bytes=LIST        select only these bytes\n\
   -c, --characters=LIST   select only these characters\n\
   -d, --delimiter=DELIM   use DELIM instead of TAB for field delimiter\n\
+"), stdout);
+      fputs (_("\
+  -D, --allow-duplicates  keep duplicated bytes/charaters/fields in LIST;\n\
+                            do not sort LIST; implies -s\n\
 "), stdout);
       fputs (_("\
   -f, --fields=LIST       select only these fields;  also print any line\n\
@@ -424,10 +441,127 @@ cut_fields (FILE *stream)
     }
 }
 
+static void
+cut_adv (FILE *stream)
+{
+  char *linebuf = NULL;
+  size_t bufsize = 0;
+  ssize_t len;
+
+  char **fieldpos = NULL ;
+  idx_t alloc_flds = 0;
+
+  /* Minor optimization: save a pointer to the last field pair sentinel
+     (which is always added by set_fields() */
+  struct field_range_pair *last_frp = frp;
+  while ( ! (last_frp->hi==UINTMAX_MAX && last_frp->lo==UINTMAX_MAX) )
+    ++last_frp;
+
+  while (true)
+    {
+      /* Read the entire line */
+      len = getdelim (&linebuf, &bufsize, line_delim, stream);
+      #if 0
+      fprintf(stderr,"Read line, len = %dz\n", len);
+      #endif
+      if (len==-1)
+        {
+          if (ferror (stream) || feof (stream))
+            break;
+          xalloc_die ();
+        }
+
+      /* Chomp */
+      if (len>0 && linebuf[len-1]==line_delim)
+        {
+          linebuf[len-1] = '\0';
+          --len;
+        }
+
+      /* Split into fields */
+      char *p = linebuf;
+      size_t l = len;
+      idx_t fld = 0 ;
+      while (true)
+        {
+          char *endp = memchr (p, delim, l);
+
+          /* NUL-terminate the field if not the last */
+          if (endp)
+            *endp = '\0';
+
+          //fprintf(stderr,"Field %ld: '%s'\n", fld, p);
+
+          /* Store this field */
+          if (fld >= alloc_flds)
+            fieldpos = xpalloc (fieldpos, &alloc_flds, 10, -1, sizeof(char*));
+          fieldpos[fld] = p;
+          fld++;
+
+          if (!endp)
+            break;
+
+          l -= (endp-p+1);
+          p = endp+1;
+        }
+
+
+      bool first = true;
+
+      if (fld>1)
+        {
+          /* Iterate the requested field LIST, and print accordingly */
+          for (struct field_range_pair* r = frp; r != last_frp ; ++r)
+            {
+              /* If open-ended range, print up to the available fields */
+              uintmax_t hi = (r->hi == UINTMAX_MAX) ? fld : r->hi;
+
+              for (uintmax_t i = r->lo - 1 ; i < hi ; ++i )
+                {
+#if 0
+                  fprintf(stderr,"Requested field: %zu\n", i);
+                  fprintf(stderr,"Field %zu: '%s'\n", i, fieldpos[i-1]);
+#endif
+
+                  if (i >=fld)
+                    break;
+
+                  if (!first)
+                    fwrite (output_delimiter_string, sizeof (char),
+                            output_delimiter_length, stdout);
+
+                  fputs (fieldpos[i], stdout);
+                  first = false;
+                }
+            }
+        }
+
+      /* Print non-delimited lines */
+      if (first && fld==1)
+        {
+          if (!suppress_non_delimited)
+            {
+              fputs(linebuf, stdout);
+              putchar (line_delim);
+            }
+          continue;
+        }
+
+      //fprintf(stderr,"end of line\n");
+      putchar (line_delim);
+
+    }
+
+  free (fieldpos);
+  free (linebuf);
+}
+
 static void
 cut_stream (FILE *stream)
 {
-  if (operating_mode == byte_mode)
+  if (adv_mode)
+    cut_adv (stream);
+  else if (operating_mode == byte_mode)
     cut_bytes (stream);
   else
     cut_fields (stream);
@@ -499,7 +633,7 @@ main (int argc, char **argv)
   delim = '\0';
   have_read_stdin = false;
 
-  while ((optc = getopt_long (argc, argv, "b:c:d:f:nsz", longopts, NULL)) != -1)
+  while ((optc = getopt_long (argc, argv, "b:c:d:Df:nsz", longopts, NULL)) != -1)
     {
       switch (optc)
         {
@@ -520,6 +654,11 @@ main (int argc, char **argv)
           spec_list_string = optarg;
           break;
 
+        case 'D':
+          adv_mode = true;
+          allow_duplicates = true;
+          break;
+
         case 'd':
           /* New delimiter. */
           /* Interpret -d '' to mean 'use the NUL byte as the delimiter.'  */
@@ -569,17 +708,29 @@ main (int argc, char **argv)
     FATAL_ERROR (_("an input delimiter may be specified only\
  when operating on fields"));
 
+  if (adv_mode && complement)
+    FATAL_ERROR (_("--complement cannot be used with -D"));
+
+  /* -D implies -s with -f */
+  if (allow_duplicates && operating_mode == field_mode)
+    suppress_non_delimited = true;
+
   if (suppress_non_delimited && operating_mode != field_mode)
     FATAL_ERROR (_("suppressing non-delimited lines makes sense\n\
 \tonly when operating on fields"));
 
   set_fields (spec_list_string,
               ( (operating_mode == field_mode) ? 0 : SETFLD_ERRMSG_USE_POS)
-              | (complement ? SETFLD_COMPLEMENT : 0) );
+              | (complement ? SETFLD_COMPLEMENT : 0)
+              | (allow_duplicates ? SETFLD_NO_SORT : 0) );
 
   if (!delim_specified)
     delim = '\t';
 
+  if (adv_mode && line_delim==delim)
+    FATAL_ERROR (_("line-delimiter must differ from field delimiter\
+ with -D"));
+
   if (output_delimiter_string == NULL)
     {
       static char dummy[2];
-- 
2.20.1

>From ec88c34b69f09416e67d971421073c38b61a1fb6 Mon Sep 17 00:00:00 2001
From: Assaf Gordon <assafgor...@gmail.com>
Date: Wed, 5 Jan 2022 18:37:18 -0700
Subject: [PATCH 3/9] tests: add 'cut -D' tests

---
 tests/misc/cut.pl | 83 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 83 insertions(+)

diff --git a/tests/misc/cut.pl b/tests/misc/cut.pl
index c93d73813..2b4e562f0 100755
--- a/tests/misc/cut.pl
+++ b/tests/misc/cut.pl
@@ -36,6 +36,10 @@ my $inval_pos = "$prog: invalid byte or character range\n$try";
 my $no_endpoint = "$prog: invalid range with no endpoint: -\n$try";
 my $nofield = "$prog: an input delimiter may be specified only when " .
               "operating on fields\n$try";
+my $complement_with_D = "$prog: --complement cannot be used " .
+               "with -D\n$try";
+my $line_field_delim_differ = "$prog: line-delimiter must differ from field " .
+               "delimiter with -D\n$try";
 
 my @Tests =
  (
@@ -227,6 +231,84 @@ my @Tests =
                                          {IN=>"123456\n"}, {OUT=>"1\n"}],
   ['EOL-subsumed-4', '--output-d=: -b1-2,2-3,3-',
                                         {IN=>"1234\n"}, {OUT=>"1234\n"}],
+
+
+  ##
+  ## Repeat some of the above tests for "-f", replacing it with "-D"
+  ## We expect the exact same results (no overlapping or out-of-order
+  ## ranges in these tests)
+  ##
+  ['D-1', '-D -d:', '-f1,3-', {IN=>"a:b:c\n"}, {OUT=>"a:c\n"}],
+  ['D-2', '-D -d:', '-f1,3-', {IN=>"a:b:c\n"}, {OUT=>"a:c\n"}],
+  ['D-3', qw(-D -d: -f2-), {IN=>"a:b:c\n"}, {OUT=>"b:c\n"}],
+  ['D-4', qw(-D -d: -f4), {IN=>"a:b:c\n"}, {OUT=>"\n"}],
+  ['D-5', qw(-D -d: -f4), {IN=>""}, {OUT=>""}],
+  ['D-a', qw(-D -s -d:), '-f3-', {IN=>"a:b:c\n"}, {OUT=>"c\n"}],
+  ['D-b', qw(-D -s -d:), '-f2,3', {IN=>"a:b:c\n"}, {OUT=>"b:c\n"}],
+  ['D-c', qw(-D -s -d:), '-f1,3', {IN=>"a:b:c\n"}, {OUT=>"a:c\n"}],
+  # Trailing colon should not be output
+  ['D-d', qw(-D -s -d:), '-f1,3', {IN=>"a:b:c:\n"}, {OUT=>"a:c\n"}],
+  ['D-e', qw(-D -s -d:), '-f3-', {IN=>"a:b:c:\n"}, {OUT=>"c:\n"}],
+  ['D-f', qw(-D -s -d:), '-f3-4', {IN=>"a:b:c:\n"}, {OUT=>"c:\n"}],
+  ['D-g', qw(-D -s -d:), '-f3,4', {IN=>"a:b:c:\n"}, {OUT=>"c:\n"}],
+  # Make sure -s suppresses non-delimited lines
+  ['D-h', qw(-D -s -d:), '-f2,3', {IN=>"abc\n"}, {OUT=>""}],
+  #
+  ['D-i', qw(-D -d: -f1-3), {IN=>":::\n"}, {OUT=>"::\n"}],
+  ['D-j', qw(-D -d: -f1-4), {IN=>":::\n"}, {OUT=>":::\n"}],
+  ['D-k', qw(-D -d: -f2-3), {IN=>":::\n"}, {OUT=>":\n"}],
+  ['D-l', qw(-D -d: -f2-4), {IN=>":::\n"}, {OUT=>"::\n"}],
+  ['D-m', qw(-D -s -d: -f1-3), {IN=>":::\n"}, {OUT=>"::\n"}],
+  ['D-n', qw(-D -s -d: -f1-4), {IN=>":::\n"}, {OUT=>":::\n"}],
+  ['D-o', qw(-D -s -d: -f2-3), {IN=>":::\n"}, {OUT=>":\n"}],
+  ['D-p', qw(-D -s -d: -f2-4), {IN=>":::\n"}, {OUT=>"::\n"}],
+  ['D-q', qw(-D -s -d: -f2-4), {IN=>":::\n:\n"}, {OUT=>"::\n\n"}],
+  ['D-r', qw(-D -s -d: -f2-4), {IN=>":::\n:1\n"}, {OUT=>"::\n1\n"}],
+  ['D-s', qw(-D -s -d: -f1-4), {IN=>":::\n:a\n"}, {OUT=>":::\n:a\n"}],
+  ['D-t', qw(-D -s -d: -f3-), {IN=>":::\n:1\n"}, {OUT=>":\n\n"}],
+  # Make sure it handles empty input properly, with and without -s.
+  ['D-u', qw(-D -s -f3-), {IN=>""}, {OUT=>""}],
+  ['D-o-delim', qw(-D -d: --out=_), '-f2,3', {IN=>"a:b:c\n"}, {OUT=>"b_c\n"}],
+  ['D-nul-idelim', qw(-D -d '' --out=_), '-f2,3', {IN=>"a\0b\0c\n"}, {OUT=>"b_c\n"}],
+  ['D-nul-odelim', qw(-D -d: --out=), '-f2,3', {IN=>"a:b:c\n"}, {OUT=>"b\0c\n"}],
+  ['D-multichar-od', qw(-D -d: --out=_._), '-f2,3', {IN=>"a:b:c\n"},
+   {OUT=>"b_._c\n"}],
+  ['D-newline-2', '-D -f1-', {IN=>""}, {OUT=>""}],
+  ['D-newline-3', '-D -d:', '-f1', {IN=>"a:1\nb:2\n"}, {OUT=>"a\nb\n"}],
+  ['D-newline-4', '-D -d:', '-f1', {IN=>"a:1\nb:2"}, {OUT=>"a\nb\n"}],
+  ['D-newline-5', '-D -d:', '-f2', {IN=>"a:1\nb:2\n"}, {OUT=>"1\n2\n"}],
+  ['D-newline-6', '-D -d:', '-f2', {IN=>"a:1\nb:2"}, {OUT=>"1\n2\n"}],
+  ['D-newline-7', '-D -s', '-d:', '-f1', {IN=>"a:1\nb:2"}, {OUT=>"a\nb\n"}],
+  ['D-newline-8', '-D -s', '-d:', '-f1', {IN=>"a:1\nb:2\n"}, {OUT=>"a\nb\n"}],
+  ['D-newline-9', '-D -s', '-d:', '-f1', {IN=>"a1\nb2"}, {OUT=>""}],
+  ['D-newline-10', '-D -s', '-d:', '-f1,2', {IN=>"a:1\nb:2"}, {OUT=>"a:1\nb:2\n"}],
+  ['D-newline-11', '-D -s', '-d:', '-f1,2', {IN=>"a:1\nb:2\n"}, {OUT=>"a:1\nb:2\n"}],
+  ['D-newline-12', '-D -s', '-d:', '-f1', {IN=>"a:1\nb:"}, {OUT=>"a\nb\n"}],
+  ['D-newline-13', '-D -d:', '-f1-', {IN=>"a1:\n:"}, {OUT=>"a1:\n:\n"}],
+  ## 'newline-14' to 'newline-26' are not supported - using '\n'
+  ## for both line and field delimiter.
+  ['D-zerot-3', '-D -z -f1-', {IN=>""}, {OUT=>""}],
+  ['D-zerot-4', '-D -z -d:', '-f1', {IN=>"a:1\0b:2"}, {OUT=>"a\0b\0"}],
+  ['D-zerot-5', '-D -z -d:', '-f1-', {IN=>"a1:\0:"}, {OUT=>"a1:\0:\0"}],
+  ## 'zerot-6' uses NUL for both line and field delimiter.
+
+
+  ##
+  ## Test "-D" with duplicated/out-of-order fields
+  ##
+  ['DD-1', '-D -d:', '-f2,3,3,1', {IN=>"a:b:c\n"}, {OUT=>"b:c:c:a\n"}],
+  ['DD-2', '-D -d:', '-f2-,1',    {IN=>"a:b:c\n"}, {OUT=>"b:c:a\n"}],
+  ['DD-3', '-D -d:', '-f1-,1',    {IN=>"a:b:c\n"}, {OUT=>"a:b:c:a\n"}],
+  ['DD-4', '-D -d:', '-f1,1-',    {IN=>"a:b:c\n"}, {OUT=>"a:a:b:c\n"}],
+  ['DD-5', '-D -d:', '-f-3,-2',   {IN=>"a:b:c\n"}, {OUT=>"a:b:c:a:b\n"}],
+  ['DD-6', '-D -d:', '-f-3,2-',   {IN=>"a:b:c:d\n"}, {OUT=>"a:b:c:b:c:d\n"}],
+
+  ## Check -D related errors
+  ['DD-err-2', '--complement -D -f2', {ERR=>$complement_with_D}, {EXIT => 1} ],
+  ['DD-err-3', "-D -f2 -d'\n'",  {ERR=>$line_field_delim_differ}, {EXIT => 1} ],
+  ['DD-err-4', "-D -f2 -d '' -z", {ERR=>$line_field_delim_differ}, {EXIT => 1} ],
+
+
  );
 
 if ($mb_locale ne 'C')
@@ -246,6 +328,7 @@ if ($mb_locale ne 'C')
   }
 
 
+
 @Tests = triple_test \@Tests;
 
 my $save_temps = $ENV{DEBUG};
-- 
2.20.1

>From 86d0a1f5ba2f195c3c398573cf8d377ea8111456 Mon Sep 17 00:00:00 2001
From: Assaf Gordon <assafgor...@gmail.com>
Date: Thu, 6 Jan 2022 12:24:40 -0700
Subject: [PATCH 4/9] cut: extract 'cut -D -f' to a separate function

---
 src/cut.c | 170 +++++++++++++++++++++++++++++-------------------------
 1 file changed, 93 insertions(+), 77 deletions(-)

diff --git a/src/cut.c b/src/cut.c
index 84caad091..369c47856 100644
--- a/src/cut.c
+++ b/src/cut.c
@@ -124,6 +124,9 @@ static bool adv_mode;
    and do not sort the output list */
 static bool allow_duplicates;
 
+/* Minor optimization: save a pointer to the last field pair sentinel
+   (which is always added by set_fields() */
+static struct field_range_pair *last_frp;
 
 /* For long options that have no equivalent short option, use a
    non-character as a pseudo short option, starting with CHAR_MAX + 1.  */
@@ -441,6 +444,87 @@ cut_fields (FILE *stream)
     }
 }
 
+static bool
+cut_adv_fields (char* linebuf, size_t len)
+{
+  static char **fieldpos = NULL ;
+  static idx_t alloc_flds = 0;
+
+  /* Split into fields */
+  char *p = linebuf;
+  size_t l = len;
+  idx_t fld = 0 ;
+  while (true)
+    {
+      char *endp = memchr (p, delim, l);
+
+      /* NUL-terminate the field if not the last */
+      if (endp)
+        *endp = '\0';
+
+      //fprintf(stderr,"Field %ld: '%s'\n", fld, p);
+
+      /* Store this field */
+      if (fld >= alloc_flds)
+        fieldpos = xpalloc (fieldpos, &alloc_flds, 10, -1, sizeof(char*));
+      fieldpos[fld] = p;
+      fld++;
+
+      if (!endp)
+        break;
+
+      l -= (endp-p+1);
+      p = endp+1;
+    }
+
+
+  bool output = false;
+
+  if (fld>1)
+    {
+      /* Iterate the requested field LIST, and print accordingly */
+      for (struct field_range_pair* r = frp; r != last_frp ; ++r)
+        {
+          /* If open-ended range, print up to the available fields */
+          uintmax_t hi = (r->hi == UINTMAX_MAX) ? fld : r->hi;
+
+          for (uintmax_t i = r->lo - 1 ; i < hi ; ++i )
+            {
+#if 0
+              fprintf(stderr,"Requested field: %zu\n", i);
+              fprintf(stderr,"Field %zu: '%s'\n", i, fieldpos[i-1]);
+#endif
+
+              if (i >=fld)
+                break;
+
+              if (output)
+                fwrite (output_delimiter_string, sizeof (char),
+                        output_delimiter_length, stdout);
+
+              fputs (fieldpos[i], stdout);
+              output = true;
+            }
+        }
+    }
+
+  /* Print non-delimited lines */
+  if (!output && fld==1)
+    {
+      if (!suppress_non_delimited)
+        {
+          fputs(linebuf, stdout);
+          output = true;
+        }
+    }
+
+  IF_LINT (free (fieldpos));
+  IF_LINT (fieldpos = NULL);
+  IF_LINT (alloc_flds = 0);
+
+  return output || fld>1;
+}
+
 static void
 cut_adv (FILE *stream)
 {
@@ -448,14 +532,6 @@ cut_adv (FILE *stream)
   size_t bufsize = 0;
   ssize_t len;
 
-  char **fieldpos = NULL ;
-  idx_t alloc_flds = 0;
-
-  /* Minor optimization: save a pointer to the last field pair sentinel
-     (which is always added by set_fields() */
-  struct field_range_pair *last_frp = frp;
-  while ( ! (last_frp->hi==UINTMAX_MAX && last_frp->lo==UINTMAX_MAX) )
-    ++last_frp;
 
   while (true)
     {
@@ -478,81 +554,15 @@ cut_adv (FILE *stream)
           --len;
         }
 
-      /* Split into fields */
-      char *p = linebuf;
-      size_t l = len;
-      idx_t fld = 0 ;
-      while (true)
-        {
-          char *endp = memchr (p, delim, l);
-
-          /* NUL-terminate the field if not the last */
-          if (endp)
-            *endp = '\0';
-
-          //fprintf(stderr,"Field %ld: '%s'\n", fld, p);
-
-          /* Store this field */
-          if (fld >= alloc_flds)
-            fieldpos = xpalloc (fieldpos, &alloc_flds, 10, -1, sizeof(char*));
-          fieldpos[fld] = p;
-          fld++;
-
-          if (!endp)
-            break;
-
-          l -= (endp-p+1);
-          p = endp+1;
-        }
-
 
-      bool first = true;
-
-      if (fld>1)
-        {
-          /* Iterate the requested field LIST, and print accordingly */
-          for (struct field_range_pair* r = frp; r != last_frp ; ++r)
-            {
-              /* If open-ended range, print up to the available fields */
-              uintmax_t hi = (r->hi == UINTMAX_MAX) ? fld : r->hi;
-
-              for (uintmax_t i = r->lo - 1 ; i < hi ; ++i )
-                {
-#if 0
-                  fprintf(stderr,"Requested field: %zu\n", i);
-                  fprintf(stderr,"Field %zu: '%s'\n", i, fieldpos[i-1]);
-#endif
-
-                  if (i >=fld)
-                    break;
-
-                  if (!first)
-                    fwrite (output_delimiter_string, sizeof (char),
-                            output_delimiter_length, stdout);
-
-                  fputs (fieldpos[i], stdout);
-                  first = false;
-                }
-            }
-        }
-
-      /* Print non-delimited lines */
-      if (first && fld==1)
-        {
-          if (!suppress_non_delimited)
-            {
-              fputs(linebuf, stdout);
-              putchar (line_delim);
-            }
-          continue;
-        }
+      bool output = cut_adv_fields (linebuf, len);
 
       //fprintf(stderr,"end of line\n");
-      putchar (line_delim);
+      if (output)
+        putchar (line_delim);
 
     }
 
-  free (fieldpos);
   free (linebuf);
 }
 
@@ -724,6 +734,12 @@ main (int argc, char **argv)
               | (complement ? SETFLD_COMPLEMENT : 0)
               | (allow_duplicates ? SETFLD_NO_SORT : 0) );
 
+  /* Minor optimization: keep a pointer to the sentinel (last) pair */
+  last_frp = frp;
+  while ( ! (last_frp->hi==UINTMAX_MAX && last_frp->lo==UINTMAX_MAX) )
+    ++last_frp;
+
+
   if (!delim_specified)
     delim = '\t';
 
-- 
2.20.1

>From 4f72a9c9b5222ace172a00f898b6fcc96743c252 Mon Sep 17 00:00:00 2001
From: Assaf Gordon <assafgor...@gmail.com>
Date: Thu, 6 Jan 2022 12:35:58 -0700
Subject: [PATCH 5/9] cut: implement -D with -b

---
 src/cut.c | 38 +++++++++++++++++++++++++++++++++++---
 1 file changed, 35 insertions(+), 3 deletions(-)

diff --git a/src/cut.c b/src/cut.c
index 369c47856..ed2e903ab 100644
--- a/src/cut.c
+++ b/src/cut.c
@@ -525,6 +525,36 @@ cut_adv_fields (char* linebuf, size_t len)
   return output || fld>1;
 }
 
+
+static bool
+cut_adv_bytes (char* linebuf, size_t len)
+{
+  bool output = false;
+
+  /* Iterate the requested field LIST, and print accordingly */
+  for (struct field_range_pair* r = frp; r != last_frp ; ++r)
+    {
+      /* If open-ended range, print up to the last byte of the line */
+      uintmax_t hi = (r->hi == UINTMAX_MAX) ? len : r->hi;
+
+      if (output_delimiter_specified && output)
+        fwrite (output_delimiter_string, sizeof (char),
+                output_delimiter_length, stdout);
+
+      for (uintmax_t i = r->lo - 1 ; i < hi ; ++i )
+        {
+          if (i >=len)
+            break;
+
+          putchar (linebuf[i]);
+
+          output = true;
+        }
+    }
+
+  return true;
+}
+
 static void
 cut_adv (FILE *stream)
 {
@@ -555,12 +585,14 @@ cut_adv (FILE *stream)
         }
 
 
-      bool output = cut_adv_fields (linebuf, len);
+      bool output ;
+      if (operating_mode == byte_mode)
+        output = cut_adv_bytes (linebuf, len);
+      else
+        output = cut_adv_fields (linebuf, len);
 
-      //fprintf(stderr,"end of line\n");
       if (output)
         putchar (line_delim);
-
     }
 
   free (linebuf);
-- 
2.20.1

>From abe2de76423cf9fd57a44f80351188e05225f3ae Mon Sep 17 00:00:00 2001
From: Assaf Gordon <assafgor...@gmail.com>
Date: Thu, 6 Jan 2022 12:45:37 -0700
Subject: [PATCH 6/9] tests: add 'cut -D -b' tests

---
 tests/misc/cut.pl | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/tests/misc/cut.pl b/tests/misc/cut.pl
index 2b4e562f0..e644963f7 100755
--- a/tests/misc/cut.pl
+++ b/tests/misc/cut.pl
@@ -308,6 +308,14 @@ my @Tests =
   ['DD-err-3', "-D -f2 -d'\n'",  {ERR=>$line_field_delim_differ}, {EXIT => 1} ],
   ['DD-err-4', "-D -f2 -d '' -z", {ERR=>$line_field_delim_differ}, {EXIT => 1} ],
 
+  ##
+  ## Check "-D" with "-b/-c"
+  ##
+  ['DB-out-delim1', '-D -c1-3,5-', '--output-d=:', {IN=>"abcdefg\n"},
+   {OUT=>"abc:efg\n"}],
+  # A totally overlapped field WITH "-D" does change the output:
+  ['DB-out-delim2', '-D -c1-3,2,5-', '--output-d=:', {IN=>"abcdefg\n"},
+   {OUT=>"abc:b:efg\n"}],
 
  );
 
-- 
2.20.1

>From 76e60af5ba90fd5c6150639bd19a7a02359df3cb Mon Sep 17 00:00:00 2001
From: Assaf Gordon <assafgor...@gmail.com>
Date: Thu, 6 Jan 2022 12:51:03 -0700
Subject: [PATCH 7/9] cut: add -O short-option for --output-delimiter

---
 src/cut.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/src/cut.c b/src/cut.c
index ed2e903ab..4e86953d3 100644
--- a/src/cut.c
+++ b/src/cut.c
@@ -132,8 +132,7 @@ static struct field_range_pair *last_frp;
    non-character as a pseudo short option, starting with CHAR_MAX + 1.  */
 enum
 {
-  OUTPUT_DELIMITER_OPTION = CHAR_MAX + 1,
-  COMPLEMENT_OPTION
+  COMPLEMENT_OPTION = CHAR_MAX + 1,
 };
 
 static struct option const longopts[] =
@@ -144,7 +143,7 @@ static struct option const longopts[] =
   {"delimiter", required_argument, NULL, 'd'},
   {"allow-duplicates", required_argument, NULL, 'D'},
   {"only-delimited", no_argument, NULL, 's'},
-  {"output-delimiter", required_argument, NULL, OUTPUT_DELIMITER_OPTION},
+  {"output-delimiter", required_argument, NULL, 'O'},
   {"complement", no_argument, NULL, COMPLEMENT_OPTION},
   {"zero-terminated", no_argument, NULL, 'z'},
   {GETOPT_HELP_OPTION_DECL},
@@ -191,7 +190,7 @@ Print selected parts of lines from each FILE to standard output.\n\
 "), stdout);
       fputs (_("\
   -s, --only-delimited    do not print lines not containing delimiters\n\
-      --output-delimiter=STRING  use STRING as the output delimiter\n\
+  -O, --output-delimiter=STRING  use STRING as the output delimiter\n\
                             the default is to use the input delimiter\n\
 "), stdout);
       fputs (_("\
@@ -675,7 +674,7 @@ main (int argc, char **argv)
   delim = '\0';
   have_read_stdin = false;
 
-  while ((optc = getopt_long (argc, argv, "b:c:d:Df:nsz", longopts, NULL)) != -1)
+  while ((optc = getopt_long (argc, argv, "b:c:d:Df:nO:sz", longopts, NULL)) != -1)
     {
       switch (optc)
         {
@@ -710,7 +709,7 @@ main (int argc, char **argv)
           delim_specified = true;
           break;
 
-        case OUTPUT_DELIMITER_OPTION:
+        case 'O':
           output_delimiter_specified = true;
           /* Interpret --output-delimiter='' to mean
              'use the NUL byte as the delimiter.'  */
-- 
2.20.1

>From 707f4c3588bf265d4145d1c3fceb1d3d6806c6c6 Mon Sep 17 00:00:00 2001
From: Assaf Gordon <assafgor...@gmail.com>
Date: Thu, 6 Jan 2022 15:36:43 -0700
Subject: [PATCH 8/9] cut: implement -F

---
 src/cut.c | 124 +++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 117 insertions(+), 7 deletions(-)

diff --git a/src/cut.c b/src/cut.c
index 4e86953d3..7da0c131f 100644
--- a/src/cut.c
+++ b/src/cut.c
@@ -32,10 +32,12 @@
 #include <sys/types.h>
 #include "system.h"
 
+#include "die.h"
 #include "error.h"
 #include "fadvise.h"
 #include "getndelim2.h"
 #include "hash.h"
+#include "regex.h"
 
 #include "set-fields.h"
 
@@ -128,6 +130,14 @@ static bool allow_duplicates;
    (which is always added by set_fields() */
 static struct field_range_pair *last_frp;
 
+/* With "-F", the input delimiter (-d) can be a regex string, not
+   just a single character. Keep the string here. */
+static char* delim_str;
+
+/* With "-F", this is the compiled regex */
+static bool delim_use_regex;
+static struct re_pattern_buffer delim_regex;
+
 /* For long options that have no equivalent short option, use a
    non-character as a pseudo short option, starting with CHAR_MAX + 1.  */
 enum
@@ -140,6 +150,7 @@ static struct option const longopts[] =
   {"bytes", required_argument, NULL, 'b'},
   {"characters", required_argument, NULL, 'c'},
   {"fields", required_argument, NULL, 'f'},
+  {"regex-fields", required_argument, NULL, 'F'},
   {"delimiter", required_argument, NULL, 'd'},
   {"allow-duplicates", required_argument, NULL, 'D'},
   {"only-delimited", no_argument, NULL, 's'},
@@ -183,6 +194,10 @@ Print selected parts of lines from each FILE to standard output.\n\
                             that contains no delimiter character, unless\n\
                             the -s option is specified\n\
   -n                      (ignored)\n\
+"), stdout);
+      fputs (_("\
+  -F, --regex-fields=LIST select only these fields; treat -d DELIM as a\n\
+ regular expression delimiter\n\
 "), stdout);
       fputs (_("\
       --complement        complement the set of selected bytes, characters\n\
@@ -200,7 +215,7 @@ Print selected parts of lines from each FILE to standard output.\n\
       fputs (VERSION_OPTION_DESCRIPTION, stdout);
       fputs (_("\
 \n\
-Use one, and only one of -b, -c or -f.  Each LIST is made up of one\n\
+Use one, and only one of -b, -c, -f or -F.  Each LIST is made up of one\n\
 range, or many ranges separated by commas.  Selected input is written\n\
 in the same order that it is read, and is written exactly once.\n\
 "), stdout);
@@ -451,11 +466,56 @@ cut_adv_fields (char* linebuf, size_t len)
 
   /* Split into fields */
   char *p = linebuf;
+  char *endp ;
   size_t l = len;
   idx_t fld = 0 ;
   while (true)
     {
-      char *endp = memchr (p, delim, l);
+      if (delim_use_regex)
+        {
+          #if 0
+          fprintf(stderr,"Running regex exec, beg = '%c'\n", *p);
+          #endif
+
+          struct re_registers regs;
+          memset (&regs, 0, sizeof regs);
+          regoff_t i = re_search (&delim_regex, p, l, 0, l, &regs);
+
+          if (i == -2)
+            FATAL_ERROR (_("regex search failed"));
+
+          #if 0
+          fprintf(stderr,"re_search returned %ld, num-reg = %zu\n", i, regs.num_regs);
+          for (int j=0;j<regs.num_regs;++j)
+            {
+              regoff_t s = regs.start[j];
+              regoff_t e = regs.end[j];
+              fprintf(stderr,"   reg[%ld].start = '%c'  end = '%c'\n", j, *(p+s), *(p+e)) ;
+            }
+          #endif
+
+          if (i >= 0)
+            {
+              /* The matched regex register is the location of the
+                 delimiting string.  Add NUL at the start (to
+                 terminate the preceding field) and set ENDP to the
+                 end of it (one octet before the next field) */
+              const regoff_t s = regs.start[0];
+              const regoff_t e = regs.end[0];
+              *(p+s) = '\0';
+              endp = p+e-1;
+            }
+          else
+            {
+              endp = 0;
+            }
+          free (regs.start);
+          free (regs.end);
+        }
+      else
+        {
+         endp = memchr (p, delim, l);
+        }
 
       /* NUL-terminate the field if not the last */
       if (endp)
@@ -674,7 +734,7 @@ main (int argc, char **argv)
   delim = '\0';
   have_read_stdin = false;
 
-  while ((optc = getopt_long (argc, argv, "b:c:d:Df:nO:sz", longopts, NULL)) != -1)
+  while ((optc = getopt_long (argc, argv, "b:c:d:Df:F:nO:sz", longopts, NULL)) != -1)
     {
       switch (optc)
         {
@@ -695,6 +755,16 @@ main (int argc, char **argv)
           spec_list_string = optarg;
           break;
 
+        case 'F':
+          /* Build the field list. */
+          if (operating_mode != undefined_mode)
+            FATAL_ERROR (_("only one type of list may be specified"));
+          operating_mode = field_mode;
+          adv_mode = true;
+          spec_list_string = optarg;
+          delim_use_regex = true;
+          break;
+
         case 'D':
           adv_mode = true;
           allow_duplicates = true;
@@ -702,10 +772,7 @@ main (int argc, char **argv)
 
         case 'd':
           /* New delimiter. */
-          /* Interpret -d '' to mean 'use the NUL byte as the delimiter.'  */
-          if (optarg[0] != '\0' && optarg[1] != '\0')
-            FATAL_ERROR (_("the delimiter must be a single character"));
-          delim = optarg[0];
+          delim_str = xstrdup (optarg);
           delim_specified = true;
           break;
 
@@ -742,6 +809,49 @@ main (int argc, char **argv)
         }
     }
 
+  if (operating_mode == field_mode && delim_use_regex && !delim_specified)
+    {
+      /* Default delimiter for -F (regex delimiter) is whitespace */
+      delim_str = xstrdup("[ \t]+");
+      delim_specified = true;
+    }
+
+  /* '-d DELIM' validation */
+  if (delim_specified)
+    {
+      if (operating_mode == field_mode && delim_use_regex)
+        {
+          /* In -F/--regex-field mode, DELIM must be a non-empty string and
+             a valid regex. */
+          if (strlen (delim_str)==0)
+            FATAL_ERROR (_("delimiter string must not be empty with -F"));
+
+          /* FIXME: What are the correct flags compared to busybox/toybox? */
+          re_set_syntax (RE_SYNTAX_EGREP | RE_HAT_LISTS_NOT_NEWLINE | RE_ICASE);
+          memset (&delim_regex, 0, sizeof delim_regex);
+          const char *s = re_compile_pattern (delim_str, strlen (delim_str), &delim_regex);
+          if (s)
+            die (EXIT_FAILURE, 0, _("regex error: %s"), s);
+
+          /* Default output delimiter is one space */
+          if (!output_delimiter_specified)
+            {
+              output_delimiter_specified = true;
+              output_delimiter_string = xstrdup (" ");
+              output_delimiter_length = 1 ;
+            }
+
+        }
+      else
+        {
+          /* Interpret -d '' to mean 'use the NUL byte as the delimiter.'  */
+          if (delim_str[0] != '\0' && delim_str[1] != '\0')
+            FATAL_ERROR (_("the delimiter must be a single character"));
+          delim = delim_str[0];
+        }
+    }
+
+
   if (operating_mode == undefined_mode)
     FATAL_ERROR (_("you must specify a list of bytes, characters, or fields"));
 
-- 
2.20.1

>From be3b1cc0acd9ffdf3feb800c14768859e49526a3 Mon Sep 17 00:00:00 2001
From: Assaf Gordon <assafgor...@gmail.com>
Date: Thu, 6 Jan 2022 15:48:07 -0700
Subject: [PATCH 9/9] tests: add 'cut -F' tests

---
 tests/misc/cut.pl | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/tests/misc/cut.pl b/tests/misc/cut.pl
index e644963f7..79ef49d40 100755
--- a/tests/misc/cut.pl
+++ b/tests/misc/cut.pl
@@ -317,6 +317,22 @@ my @Tests =
   ['DB-out-delim2', '-D -c1-3,2,5-', '--output-d=:', {IN=>"abcdefg\n"},
    {OUT=>"abc:b:efg\n"}],
 
+  ##
+  ## Check "-F" (regex delimiter)
+  ##
+  ['RE-1', "-d [0-9]+ -F3,1,2,1", {IN=>"abc123def456efg\n"},
+   {OUT=>"abc def efg\n"}],
+  ['RE-2', "-d [0-9]+ -F3,1,2,1 -D", {IN=>"abc123def456efg\n"},
+   {OUT=>"efg abc def abc\n"}],
+  ['RE-3', "-d [0-9]+ -F3,1,2,1 -D -O:", {IN=>"abc123def456efg\n"},
+   {OUT=>"efg:abc:def:abc\n"}],
+  ['RE-4', " -F3,1,2,1 -D", {IN=>"abc \t def\t\tefg\n"},
+   {OUT=>"efg abc def abc\n"}],
+  ['RE-5', " -F3,1,2,1 -D -O:", {IN=>"abc \t def\t\tefg\n"},
+   {OUT=>"efg:abc:def:abc\n"}],
+  ['RE-6', " -F3,1,2,1 -O:", {IN=>"abc \t def\t\tefg\n"},
+   {OUT=>"abc:def:efg\n"}],
+
  );
 
 if ($mb_locale ne 'C')
-- 
2.20.1

Reply via email to