Re: cut -DF

Assaf Gordon Tue, 25 Jan 2022 00:47:19 -0800

Hello,

Here's an updated patch for "cut -DF".

Since it's a new code path, it opens the possibility of finally supporting multibyte characters with "cut -c".



Comments are very welcome,
 - assaf

 [PATCH 01/18] cut: set-fields: add no-sort options
 [PATCH 02/18] cut: initial -D implementation, currently only with "-f"
 [PATCH 03/18] tests: add 'cut -D' tests
 [PATCH 04/18] cut: extract 'cut -D -f' to a separate function
 [PATCH 05/18] cut: implement -D with -b
 [PATCH 06/18] tests: add 'cut -D -b' tests
 [PATCH 07/18] cut: add -O short-option for --output-delimiter
 [PATCH 08/18] cut: implement -F
 [PATCH 09/18] tests: add 'cut -F' tests
 [PATCH 10/18] cut: extract cut-fields into separate functions
 [PATCH 11/18] cut: implement multibyte -c/--characters
 [PATCH 12/18] cut: change -F regex syntax to BRE
 [PATCH 13/18] cut: change -D long-option equivalent
 [PATCH 14/18] doc: mention 'cut -D' in NEWS
 [PATCH 15/18] doc: mention 'cut -F' in NEWS
 [PATCH 16/18] doc: mention 'cut -O' in NEWS
 [PATCH 17/18] doc: mention multibyte 'cut -c' in NEWS
 [PATCH 18/18] doc: expand 'cut' section

From 2557ced8cb30655ef55c8532d814798172b5c392 Mon Sep 17 00:00:00 2001
From: Assaf Gordon <assafgor...@gmail.com>
Date: Wed, 5 Jan 2022 13:03:39 -0700
Subject: [PATCH 01/18] cut: set-fields: add no-sort options

---
 src/set-fields.c | 27 +++++++++++++++------------
 src/set-fields.h |  4 +++-
 2 files changed, 18 insertions(+), 13 deletions(-)

diff --git a/src/set-fields.c b/src/set-fields.c
index e3cce30d9..5e4ee6715 100644
--- a/src/set-fields.c
+++ b/src/set-fields.c
@@ -279,22 +279,25 @@ set_fields (char const *fieldstr, unsigned int options)
                  ? _("missing list of byte/character positions")
                  : _("missing list of fields"));
 
-  qsort (frp, n_frp, sizeof (frp[0]), compare_ranges);
-
-  /* Merge range pairs (e.g. `2-5,3-4' becomes `2-5'). */
-  for (size_t i = 0; i < n_frp; ++i)
+  if (!(options & SETFLD_NO_SORT))
     {
-      for (size_t j = i + 1; j < n_frp; ++j)
+      qsort (frp, n_frp, sizeof (frp[0]), compare_ranges);
+
+      /* Merge range pairs (e.g. `2-5,3-4' becomes `2-5'). */
+      for (size_t i = 0; i < n_frp; ++i)
         {
-          if (frp[j].lo <= frp[i].hi)
+          for (size_t j = i + 1; j < n_frp; ++j)
             {
-              frp[i].hi = MAX (frp[j].hi, frp[i].hi);
-              memmove (frp + j, frp + j + 1, (n_frp - j - 1) * sizeof *frp);
-              n_frp--;
-              j--;
+              if (frp[j].lo <= frp[i].hi)
+                {
+                  frp[i].hi = MAX (frp[j].hi, frp[i].hi);
+                  memmove (frp + j, frp + j + 1, (n_frp - j - 1) * sizeof *frp);
+                  n_frp--;
+                  j--;
+                }
+              else
+                break;
             }
-          else
-            break;
         }
     }
 
diff --git a/src/set-fields.h b/src/set-fields.h
index 7bc9b3afe..9127d9957 100644
--- a/src/set-fields.h
+++ b/src/set-fields.h
@@ -34,8 +34,10 @@ enum
 {
   SETFLD_ALLOW_DASH = 0x01,     /* allow single dash meaning 'all fields' */
   SETFLD_COMPLEMENT = 0x02,     /* complement the field list */
-  SETFLD_ERRMSG_USE_POS = 0x04  /* when reporting errors, say 'position' instead
+  SETFLD_ERRMSG_USE_POS = 0x04, /* when reporting errors, say 'position' instead
                                    of 'field' (used with cut -b/-c) */
+  SETFLD_NO_SORT    = 0x08      /* Do not sort the fields; keep duplicated
+                                   and overlapped fields */
 };
 
 /* allocates and initializes the FRP array and N_FRP count */
-- 
2.30.2

From 6db6c47aabe5c0ba194cecb1f8f24957b65e1556 Mon Sep 17 00:00:00 2001
From: Assaf Gordon <assafgor...@gmail.com>
Date: Wed, 5 Jan 2022 13:04:08 -0700
Subject: [PATCH 02/18] cut: initial -D implementation, currently only with
 "-f"

---
 src/cut.c | 161 ++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 156 insertions(+), 5 deletions(-)

diff --git a/src/cut.c b/src/cut.c
index 5143c8bd9..84caad091 100644
--- a/src/cut.c
+++ b/src/cut.c
@@ -20,7 +20,9 @@
 /* POSIX changes, bug fixes, long-named options, and cleanup
    by David MacKenzie <d...@gnu.ai.mit.edu>.
 
-   Rewrite cut_fields and cut_bytes -- Jim Meyering.  */
+   Rewrite cut_fields and cut_bytes -- Jim Meyering.
+
+   Match toybox's -D,-F,-O options -- Assaf Gordon. */
 
 #include <config.h>
 
@@ -43,7 +45,8 @@
 #define AUTHORS \
   proper_name ("David M. Ihnat"), \
   proper_name ("David MacKenzie"), \
-  proper_name ("Jim Meyering")
+  proper_name ("Jim Meyering"), \
+  proper_name ("Assaf Gordon")
 
 #define FATAL_ERROR(Message)						\
   do									\
@@ -113,6 +116,15 @@ static char *output_delimiter_string;
 /* True if we have ever read standard input. */
 static bool have_read_stdin;
 
+/* If true use different (but less optimized) code,
+   Used with -F and/or -D.  */
+static bool adv_mode;
+
+/* True if -D is used: allow duplicated output bytes/chars/fields
+   and do not sort the output list */
+static bool allow_duplicates;
+
+
 /* For long options that have no equivalent short option, use a
    non-character as a pseudo short option, starting with CHAR_MAX + 1.  */
 enum
@@ -127,6 +139,7 @@ static struct option const longopts[] =
   {"characters", required_argument, NULL, 'c'},
   {"fields", required_argument, NULL, 'f'},
   {"delimiter", required_argument, NULL, 'd'},
+  {"allow-duplicates", required_argument, NULL, 'D'},
   {"only-delimited", no_argument, NULL, 's'},
   {"output-delimiter", required_argument, NULL, OUTPUT_DELIMITER_OPTION},
   {"complement", no_argument, NULL, COMPLEMENT_OPTION},
@@ -158,6 +171,10 @@ Print selected parts of lines from each FILE to standard output.\n\
   -b, --bytes=LIST        select only these bytes\n\
   -c, --characters=LIST   select only these characters\n\
   -d, --delimiter=DELIM   use DELIM instead of TAB for field delimiter\n\
+"), stdout);
+      fputs (_("\
+  -D, --allow-duplicates  keep duplicated bytes/charaters/fields in LIST;\n\
+                            do not sort LIST; implies -s\n\
 "), stdout);
       fputs (_("\
   -f, --fields=LIST       select only these fields;  also print any line\n\
@@ -424,10 +441,127 @@ cut_fields (FILE *stream)
     }
 }
 
+static void
+cut_adv (FILE *stream)
+{
+  char *linebuf = NULL;
+  size_t bufsize = 0;
+  ssize_t len;
+
+  char **fieldpos = NULL ;
+  idx_t alloc_flds = 0;
+
+  /* Minor optimization: save a pointer to the last field pair sentinel
+     (which is always added by set_fields() */
+  struct field_range_pair *last_frp = frp;
+  while ( ! (last_frp->hi==UINTMAX_MAX && last_frp->lo==UINTMAX_MAX) )
+    ++last_frp;
+
+  while (true)
+    {
+      /* Read the entire line */
+      len = getdelim (&linebuf, &bufsize, line_delim, stream);
+      #if 0
+      fprintf(stderr,"Read line, len = %dz\n", len);
+      #endif
+      if (len==-1)
+        {
+          if (ferror (stream) || feof (stream))
+            break;
+          xalloc_die ();
+        }
+
+      /* Chomp */
+      if (len>0 && linebuf[len-1]==line_delim)
+        {
+          linebuf[len-1] = '\0';
+          --len;
+        }
+
+      /* Split into fields */
+      char *p = linebuf;
+      size_t l = len;
+      idx_t fld = 0 ;
+      while (true)
+        {
+          char *endp = memchr (p, delim, l);
+
+          /* NUL-terminate the field if not the last */
+          if (endp)
+            *endp = '\0';
+
+          //fprintf(stderr,"Field %ld: '%s'\n", fld, p);
+
+          /* Store this field */
+          if (fld >= alloc_flds)
+            fieldpos = xpalloc (fieldpos, &alloc_flds, 10, -1, sizeof(char*));
+          fieldpos[fld] = p;
+          fld++;
+
+          if (!endp)
+            break;
+
+          l -= (endp-p+1);
+          p = endp+1;
+        }
+
+
+      bool first = true;
+
+      if (fld>1)
+        {
+          /* Iterate the requested field LIST, and print accordingly */
+          for (struct field_range_pair* r = frp; r != last_frp ; ++r)
+            {
+              /* If open-ended range, print up to the available fields */
+              uintmax_t hi = (r->hi == UINTMAX_MAX) ? fld : r->hi;
+
+              for (uintmax_t i = r->lo - 1 ; i < hi ; ++i )
+                {
+#if 0
+                  fprintf(stderr,"Requested field: %zu\n", i);
+                  fprintf(stderr,"Field %zu: '%s'\n", i, fieldpos[i-1]);
+#endif
+
+                  if (i >=fld)
+                    break;
+
+                  if (!first)
+                    fwrite (output_delimiter_string, sizeof (char),
+                            output_delimiter_length, stdout);
+
+                  fputs (fieldpos[i], stdout);
+                  first = false;
+                }
+            }
+        }
+
+      /* Print non-delimited lines */
+      if (first && fld==1)
+        {
+          if (!suppress_non_delimited)
+            {
+              fputs(linebuf, stdout);
+              putchar (line_delim);
+            }
+          continue;
+        }
+
+      //fprintf(stderr,"end of line\n");
+      putchar (line_delim);
+
+    }
+
+  free (fieldpos);
+  free (linebuf);
+}
+
 static void
 cut_stream (FILE *stream)
 {
-  if (operating_mode == byte_mode)
+  if (adv_mode)
+    cut_adv (stream);
+  else if (operating_mode == byte_mode)
     cut_bytes (stream);
   else
     cut_fields (stream);
@@ -499,7 +633,7 @@ main (int argc, char **argv)
   delim = '\0';
   have_read_stdin = false;
 
-  while ((optc = getopt_long (argc, argv, "b:c:d:f:nsz", longopts, NULL)) != -1)
+  while ((optc = getopt_long (argc, argv, "b:c:d:Df:nsz", longopts, NULL)) != -1)
     {
       switch (optc)
         {
@@ -520,6 +654,11 @@ main (int argc, char **argv)
           spec_list_string = optarg;
           break;
 
+        case 'D':
+          adv_mode = true;
+          allow_duplicates = true;
+          break;
+
         case 'd':
           /* New delimiter. */
           /* Interpret -d '' to mean 'use the NUL byte as the delimiter.'  */
@@ -569,17 +708,29 @@ main (int argc, char **argv)
     FATAL_ERROR (_("an input delimiter may be specified only\
  when operating on fields"));
 
+  if (adv_mode && complement)
+    FATAL_ERROR (_("--complement cannot be used with -D"));
+
+  /* -D implies -s with -f */
+  if (allow_duplicates && operating_mode == field_mode)
+    suppress_non_delimited = true;
+
   if (suppress_non_delimited && operating_mode != field_mode)
     FATAL_ERROR (_("suppressing non-delimited lines makes sense\n\
 \tonly when operating on fields"));
 
   set_fields (spec_list_string,
               ( (operating_mode == field_mode) ? 0 : SETFLD_ERRMSG_USE_POS)
-              | (complement ? SETFLD_COMPLEMENT : 0) );
+              | (complement ? SETFLD_COMPLEMENT : 0)
+              | (allow_duplicates ? SETFLD_NO_SORT : 0) );
 
   if (!delim_specified)
     delim = '\t';
 
+  if (adv_mode && line_delim==delim)
+    FATAL_ERROR (_("line-delimiter must differ from field delimiter\
+ with -D"));
+
   if (output_delimiter_string == NULL)
     {
       static char dummy[2];
-- 
2.30.2

From cf4cf972196d9594ef8ca688d96ad3085e78bec6 Mon Sep 17 00:00:00 2001
From: Assaf Gordon <assafgor...@gmail.com>
Date: Wed, 5 Jan 2022 18:37:18 -0700
Subject: [PATCH 03/18] tests: add 'cut -D' tests

---
 tests/misc/cut.pl | 83 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 83 insertions(+)

diff --git a/tests/misc/cut.pl b/tests/misc/cut.pl
index c93d73813..2b4e562f0 100755
--- a/tests/misc/cut.pl
+++ b/tests/misc/cut.pl
@@ -36,6 +36,10 @@ my $inval_pos = "$prog: invalid byte or character range\n$try";
 my $no_endpoint = "$prog: invalid range with no endpoint: -\n$try";
 my $nofield = "$prog: an input delimiter may be specified only when " .
               "operating on fields\n$try";
+my $complement_with_D = "$prog: --complement cannot be used " .
+               "with -D\n$try";
+my $line_field_delim_differ = "$prog: line-delimiter must differ from field " .
+               "delimiter with -D\n$try";
 
 my @Tests =
  (
@@ -227,6 +231,84 @@ my @Tests =
                                          {IN=>"123456\n"}, {OUT=>"1\n"}],
   ['EOL-subsumed-4', '--output-d=: -b1-2,2-3,3-',
                                         {IN=>"1234\n"}, {OUT=>"1234\n"}],
+
+
+  ##
+  ## Repeat some of the above tests for "-f", replacing it with "-D"
+  ## We expecte the exact same results (no overlapping or out-of-order
+  ## ranges in these tests)
+  ##
+  ['D-1', '-D -d:', '-f1,3-', {IN=>"a:b:c\n"}, {OUT=>"a:c\n"}],
+  ['D-2', '-D -d:', '-f1,3-', {IN=>"a:b:c\n"}, {OUT=>"a:c\n"}],
+  ['D-3', qw(-D -d: -f2-), {IN=>"a:b:c\n"}, {OUT=>"b:c\n"}],
+  ['D-4', qw(-D -d: -f4), {IN=>"a:b:c\n"}, {OUT=>"\n"}],
+  ['D-5', qw(-D -d: -f4), {IN=>""}, {OUT=>""}],
+  ['D-a', qw(-D -s -d:), '-f3-', {IN=>"a:b:c\n"}, {OUT=>"c\n"}],
+  ['D-b', qw(-D -s -d:), '-f2,3', {IN=>"a:b:c\n"}, {OUT=>"b:c\n"}],
+  ['D-c', qw(-D -s -d:), '-f1,3', {IN=>"a:b:c\n"}, {OUT=>"a:c\n"}],
+  # Trailing colon should not be output
+  ['D-d', qw(-D -s -d:), '-f1,3', {IN=>"a:b:c:\n"}, {OUT=>"a:c\n"}],
+  ['D-e', qw(-D -s -d:), '-f3-', {IN=>"a:b:c:\n"}, {OUT=>"c:\n"}],
+  ['D-f', qw(-D -s -d:), '-f3-4', {IN=>"a:b:c:\n"}, {OUT=>"c:\n"}],
+  ['D-g', qw(-D -s -d:), '-f3,4', {IN=>"a:b:c:\n"}, {OUT=>"c:\n"}],
+  # Make sure -s suppresses non-delimited lines
+  ['D-h', qw(-D -s -d:), '-f2,3', {IN=>"abc\n"}, {OUT=>""}],
+  #
+  ['D-i', qw(-D -d: -f1-3), {IN=>":::\n"}, {OUT=>"::\n"}],
+  ['D-j', qw(-D -d: -f1-4), {IN=>":::\n"}, {OUT=>":::\n"}],
+  ['D-k', qw(-D -d: -f2-3), {IN=>":::\n"}, {OUT=>":\n"}],
+  ['D-l', qw(-D -d: -f2-4), {IN=>":::\n"}, {OUT=>"::\n"}],
+  ['D-m', qw(-D -s -d: -f1-3), {IN=>":::\n"}, {OUT=>"::\n"}],
+  ['D-n', qw(-D -s -d: -f1-4), {IN=>":::\n"}, {OUT=>":::\n"}],
+  ['D-o', qw(-D -s -d: -f2-3), {IN=>":::\n"}, {OUT=>":\n"}],
+  ['D-p', qw(-D -s -d: -f2-4), {IN=>":::\n"}, {OUT=>"::\n"}],
+  ['D-q', qw(-D -s -d: -f2-4), {IN=>":::\n:\n"}, {OUT=>"::\n\n"}],
+  ['D-r', qw(-D -s -d: -f2-4), {IN=>":::\n:1\n"}, {OUT=>"::\n1\n"}],
+  ['D-s', qw(-D -s -d: -f1-4), {IN=>":::\n:a\n"}, {OUT=>":::\n:a\n"}],
+  ['D-t', qw(-D -s -d: -f3-), {IN=>":::\n:1\n"}, {OUT=>":\n\n"}],
+  # Make sure it handles empty input properly, with and without -s.
+  ['D-u', qw(-D -s -f3-), {IN=>""}, {OUT=>""}],
+  ['D-o-delim', qw(-D -d: --out=_), '-f2,3', {IN=>"a:b:c\n"}, {OUT=>"b_c\n"}],
+  ['D-nul-idelim', qw(-D -d '' --out=_), '-f2,3', {IN=>"a\0b\0c\n"}, {OUT=>"b_c\n"}],
+  ['D-nul-odelim', qw(-D -d: --out=), '-f2,3', {IN=>"a:b:c\n"}, {OUT=>"b\0c\n"}],
+  ['D-multichar-od', qw(-D -d: --out=_._), '-f2,3', {IN=>"a:b:c\n"},
+   {OUT=>"b_._c\n"}],
+  ['D-newline-2', '-D -f1-', {IN=>""}, {OUT=>""}],
+  ['D-newline-3', '-D -d:', '-f1', {IN=>"a:1\nb:2\n"}, {OUT=>"a\nb\n"}],
+  ['D-newline-4', '-D -d:', '-f1', {IN=>"a:1\nb:2"}, {OUT=>"a\nb\n"}],
+  ['D-newline-5', '-D -d:', '-f2', {IN=>"a:1\nb:2\n"}, {OUT=>"1\n2\n"}],
+  ['D-newline-6', '-D -d:', '-f2', {IN=>"a:1\nb:2"}, {OUT=>"1\n2\n"}],
+  ['D-newline-7', '-D -s', '-d:', '-f1', {IN=>"a:1\nb:2"}, {OUT=>"a\nb\n"}],
+  ['D-newline-8', '-D -s', '-d:', '-f1', {IN=>"a:1\nb:2\n"}, {OUT=>"a\nb\n"}],
+  ['D-newline-9', '-D -s', '-d:', '-f1', {IN=>"a1\nb2"}, {OUT=>""}],
+  ['D-newline-10', '-D -s', '-d:', '-f1,2', {IN=>"a:1\nb:2"}, {OUT=>"a:1\nb:2\n"}],
+  ['D-newline-11', '-D -s', '-d:', '-f1,2', {IN=>"a:1\nb:2\n"}, {OUT=>"a:1\nb:2\n"}],
+  ['D-newline-12', '-D -s', '-d:', '-f1', {IN=>"a:1\nb:"}, {OUT=>"a\nb\n"}],
+  ['D-newline-13', '-D -d:', '-f1-', {IN=>"a1:\n:"}, {OUT=>"a1:\n:\n"}],
+  ## 'newlines-14' to 'newline-26' are not supported - using '\n'
+  ## for both line and field delimiter.
+  ['D-zerot-3', '-D -z -f1-', {IN=>""}, {OUT=>""}],
+  ['D-zerot-4', '-D -z -d:', '-f1', {IN=>"a:1\0b:2"}, {OUT=>"a\0b\0"}],
+  ['D-zerot-5', '-D -z -d:', '-f1-', {IN=>"a1:\0:"}, {OUT=>"a1:\0:\0"}],
+  ## 'zerot-6' uses NUL for both line and field delimiter.
+
+
+  ##
+  ## Test "-D" with duplicated/out-of-order fields
+  ##
+  ['DD-1', '-D -d:', '-f2,3,3,1', {IN=>"a:b:c\n"}, {OUT=>"b:c:c:a\n"}],
+  ['DD-2', '-D -d:', '-f2-,1',    {IN=>"a:b:c\n"}, {OUT=>"b:c:a\n"}],
+  ['DD-3', '-D -d:', '-f1-,1',    {IN=>"a:b:c\n"}, {OUT=>"a:b:c:a\n"}],
+  ['DD-4', '-D -d:', '-f1,1-',    {IN=>"a:b:c\n"}, {OUT=>"a:a:b:c\n"}],
+  ['DD-5', '-D -d:', '-f-3,-2',   {IN=>"a:b:c\n"}, {OUT=>"a:b:c:a:b\n"}],
+  ['DD-6', '-D -d:', '-f-3,2-',   {IN=>"a:b:c:d\n"}, {OUT=>"a:b:c:b:c:d\n"}],
+
+  ## Check -D related errors
+  ['DD-err-2', '--complement -D -f2', {ERR=>$complement_with_D}, {EXIT => 1} ],
+  ['DD-err-3', "-D -f2 -d'\n'",  {ERR=>$line_field_delim_differ}, {EXIT => 1} ],
+  ['DD-err-4', "-D -f2 -d '' -z", {ERR=>$line_field_delim_differ}, {EXIT => 1} ],
+
+
  );
 
 if ($mb_locale ne 'C')
@@ -246,6 +328,7 @@ if ($mb_locale ne 'C')
   }
 
 
+
 @Tests = triple_test \@Tests;
 
 my $save_temps = $ENV{DEBUG};
-- 
2.30.2

From e07f1f41c6cd42069f4e31e08938e9de3587f319 Mon Sep 17 00:00:00 2001
From: Assaf Gordon <assafgor...@gmail.com>
Date: Thu, 6 Jan 2022 12:24:40 -0700
Subject: [PATCH 04/18] cut: extract 'cut -D -f' to a separate function

---
 src/cut.c | 170 +++++++++++++++++++++++++++++-------------------------
 1 file changed, 93 insertions(+), 77 deletions(-)

diff --git a/src/cut.c b/src/cut.c
index 84caad091..369c47856 100644
--- a/src/cut.c
+++ b/src/cut.c
@@ -124,6 +124,9 @@ static bool adv_mode;
    and do not sort the output list */
 static bool allow_duplicates;
 
+/* Minor optimization: save a pointer to the last field pair sentinel
+   (which is always added by set_fields() */
+static struct field_range_pair *last_frp;
 
 /* For long options that have no equivalent short option, use a
    non-character as a pseudo short option, starting with CHAR_MAX + 1.  */
@@ -441,6 +444,87 @@ cut_fields (FILE *stream)
     }
 }
 
+static bool
+cut_adv_fields (char* linebuf, size_t len)
+{
+  static char **fieldpos = NULL ;
+  static idx_t alloc_flds = 0;
+
+  /* Split into fields */
+  char *p = linebuf;
+  size_t l = len;
+  idx_t fld = 0 ;
+  while (true)
+    {
+      char *endp = memchr (p, delim, l);
+
+      /* NUL-terminate the field if not the last */
+      if (endp)
+        *endp = '\0';
+
+      //fprintf(stderr,"Field %ld: '%s'\n", fld, p);
+
+      /* Store this field */
+      if (fld >= alloc_flds)
+        fieldpos = xpalloc (fieldpos, &alloc_flds, 10, -1, sizeof(char*));
+      fieldpos[fld] = p;
+      fld++;
+
+      if (!endp)
+        break;
+
+      l -= (endp-p+1);
+      p = endp+1;
+    }
+
+
+  bool output = false;
+
+  if (fld>1)
+    {
+      /* Iterate the requested field LIST, and print accordingly */
+      for (struct field_range_pair* r = frp; r != last_frp ; ++r)
+        {
+          /* If open-ended range, print up to the available fields */
+          uintmax_t hi = (r->hi == UINTMAX_MAX) ? fld : r->hi;
+
+          for (uintmax_t i = r->lo - 1 ; i < hi ; ++i )
+            {
+#if 0
+              fprintf(stderr,"Requested field: %zu\n", i);
+              fprintf(stderr,"Field %zu: '%s'\n", i, fieldpos[i-1]);
+#endif
+
+              if (i >=fld)
+                break;
+
+              if (output)
+                fwrite (output_delimiter_string, sizeof (char),
+                        output_delimiter_length, stdout);
+
+              fputs (fieldpos[i], stdout);
+              output = true;
+            }
+        }
+    }
+
+  /* Print non-delimited lines */
+  if (!output && fld==1)
+    {
+      if (!suppress_non_delimited)
+        {
+          fputs(linebuf, stdout);
+          output = true;
+        }
+    }
+
+  IF_LINT (free (fieldpos));
+  IF_LINT (fieldpos = NULL);
+  IF_LINT (alloc_flds = 0);
+
+  return output || fld>1;
+}
+
 static void
 cut_adv (FILE *stream)
 {
@@ -448,14 +532,6 @@ cut_adv (FILE *stream)
   size_t bufsize = 0;
   ssize_t len;
 
-  char **fieldpos = NULL ;
-  idx_t alloc_flds = 0;
-
-  /* Minor optimization: save a pointer to the last field pair sentinel
-     (which is always added by set_fields() */
-  struct field_range_pair *last_frp = frp;
-  while ( ! (last_frp->hi==UINTMAX_MAX && last_frp->lo==UINTMAX_MAX) )
-    ++last_frp;
 
   while (true)
     {
@@ -478,81 +554,15 @@ cut_adv (FILE *stream)
           --len;
         }
 
-      /* Split into fields */
-      char *p = linebuf;
-      size_t l = len;
-      idx_t fld = 0 ;
-      while (true)
-        {
-          char *endp = memchr (p, delim, l);
-
-          /* NUL-terminate the field if not the last */
-          if (endp)
-            *endp = '\0';
-
-          //fprintf(stderr,"Field %ld: '%s'\n", fld, p);
-
-          /* Store this field */
-          if (fld >= alloc_flds)
-            fieldpos = xpalloc (fieldpos, &alloc_flds, 10, -1, sizeof(char*));
-          fieldpos[fld] = p;
-          fld++;
-
-          if (!endp)
-            break;
-
-          l -= (endp-p+1);
-          p = endp+1;
-        }
-
 
-      bool first = true;
-
-      if (fld>1)
-        {
-          /* Iterate the requested field LIST, and print accordingly */
-          for (struct field_range_pair* r = frp; r != last_frp ; ++r)
-            {
-              /* If open-ended range, print up to the available fields */
-              uintmax_t hi = (r->hi == UINTMAX_MAX) ? fld : r->hi;
-
-              for (uintmax_t i = r->lo - 1 ; i < hi ; ++i )
-                {
-#if 0
-                  fprintf(stderr,"Requested field: %zu\n", i);
-                  fprintf(stderr,"Field %zu: '%s'\n", i, fieldpos[i-1]);
-#endif
-
-                  if (i >=fld)
-                    break;
-
-                  if (!first)
-                    fwrite (output_delimiter_string, sizeof (char),
-                            output_delimiter_length, stdout);
-
-                  fputs (fieldpos[i], stdout);
-                  first = false;
-                }
-            }
-        }
-
-      /* Print non-delimited lines */
-      if (first && fld==1)
-        {
-          if (!suppress_non_delimited)
-            {
-              fputs(linebuf, stdout);
-              putchar (line_delim);
-            }
-          continue;
-        }
+      bool output = cut_adv_fields (linebuf, len);
 
       //fprintf(stderr,"end of line\n");
-      putchar (line_delim);
+      if (output)
+        putchar (line_delim);
 
     }
 
-  free (fieldpos);
   free (linebuf);
 }
 
@@ -724,6 +734,12 @@ main (int argc, char **argv)
               | (complement ? SETFLD_COMPLEMENT : 0)
               | (allow_duplicates ? SETFLD_NO_SORT : 0) );
 
+  /* Minor optimization: keep a pointer to the sentinel (last) pair */
+  last_frp = frp;
+  while ( ! (last_frp->hi==UINTMAX_MAX && last_frp->lo==UINTMAX_MAX) )
+    ++last_frp;
+
+
   if (!delim_specified)
     delim = '\t';
 
-- 
2.30.2

From 5091a2750a4cdd603816d8eccec03e440ab4af64 Mon Sep 17 00:00:00 2001
From: Assaf Gordon <assafgor...@gmail.com>
Date: Thu, 6 Jan 2022 12:35:58 -0700
Subject: [PATCH 05/18] cut: implement -D with -b

---
 src/cut.c | 38 +++++++++++++++++++++++++++++++++++---
 1 file changed, 35 insertions(+), 3 deletions(-)

diff --git a/src/cut.c b/src/cut.c
index 369c47856..ed2e903ab 100644
--- a/src/cut.c
+++ b/src/cut.c
@@ -525,6 +525,36 @@ cut_adv_fields (char* linebuf, size_t len)
   return output || fld>1;
 }
 
+
+static bool
+cut_adv_bytes (char* linebuf, size_t len)
+{
+  bool output = false;
+
+  /* Iterate the requested field LIST, and print accordingly */
+  for (struct field_range_pair* r = frp; r != last_frp ; ++r)
+    {
+      /* If open-ended range, print up to the available fields */
+      uintmax_t hi = (r->hi == UINTMAX_MAX) ? len : r->hi;
+
+      if (output_delimiter_specified && output)
+        fwrite (output_delimiter_string, sizeof (char),
+                output_delimiter_length, stdout);
+
+      for (uintmax_t i = r->lo - 1 ; i < hi ; ++i )
+        {
+          if (i >=len)
+            break;
+
+          putchar (linebuf[i]);
+
+          output = true;
+        }
+    }
+
+  return true;
+}
+
 static void
 cut_adv (FILE *stream)
 {
@@ -555,12 +585,14 @@ cut_adv (FILE *stream)
         }
 
 
-      bool output = cut_adv_fields (linebuf, len);
+      bool output ;
+      if (operating_mode == byte_mode)
+        output = cut_adv_bytes (linebuf, len);
+      else
+        output = cut_adv_fields (linebuf, len);
 
-      //fprintf(stderr,"end of line\n");
       if (output)
         putchar (line_delim);
-
     }
 
   free (linebuf);
-- 
2.30.2

From 6b306fcd2c3286822d68180b0c0634118df5a112 Mon Sep 17 00:00:00 2001
From: Assaf Gordon <assafgor...@gmail.com>
Date: Thu, 6 Jan 2022 12:45:37 -0700
Subject: [PATCH 06/18] tests: add 'cut -D -b' tests

---
 tests/misc/cut.pl | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/tests/misc/cut.pl b/tests/misc/cut.pl
index 2b4e562f0..e644963f7 100755
--- a/tests/misc/cut.pl
+++ b/tests/misc/cut.pl
@@ -308,6 +308,14 @@ my @Tests =
   ['DD-err-3', "-D -f2 -d'\n'",  {ERR=>$line_field_delim_differ}, {EXIT => 1} ],
   ['DD-err-4', "-D -f2 -d '' -z", {ERR=>$line_field_delim_differ}, {EXIT => 1} ],
 
+  ##
+  ## Check "-D" with "-b/-c"
+  ##
+  ['DB-out-delim1', '-D -c1-3,5-', '--output-d=:', {IN=>"abcdefg\n"},
+   {OUT=>"abc:efg\n"}],
+  # A totally overlapped field WITH "-D" does change the output:
+  ['DB-out-delim2', '-D -c1-3,2,5-', '--output-d=:', {IN=>"abcdefg\n"},
+   {OUT=>"abc:b:efg\n"}],
 
  );
 
-- 
2.30.2

From 1d968471c01b3cbef1aa3c04f3040c7dbb63627c Mon Sep 17 00:00:00 2001
From: Assaf Gordon <assafgor...@gmail.com>
Date: Thu, 6 Jan 2022 12:51:03 -0700
Subject: [PATCH 07/18] cut: add -O short-option for --output-delimiter

---
 src/cut.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/src/cut.c b/src/cut.c
index ed2e903ab..4e86953d3 100644
--- a/src/cut.c
+++ b/src/cut.c
@@ -132,8 +132,7 @@ static struct field_range_pair *last_frp;
    non-character as a pseudo short option, starting with CHAR_MAX + 1.  */
 enum
 {
-  OUTPUT_DELIMITER_OPTION = CHAR_MAX + 1,
-  COMPLEMENT_OPTION
+  COMPLEMENT_OPTION = CHAR_MAX + 1,
 };
 
 static struct option const longopts[] =
@@ -144,7 +143,7 @@ static struct option const longopts[] =
   {"delimiter", required_argument, NULL, 'd'},
   {"allow-duplicates", required_argument, NULL, 'D'},
   {"only-delimited", no_argument, NULL, 's'},
-  {"output-delimiter", required_argument, NULL, OUTPUT_DELIMITER_OPTION},
+  {"output-delimiter", required_argument, NULL, 'O'},
   {"complement", no_argument, NULL, COMPLEMENT_OPTION},
   {"zero-terminated", no_argument, NULL, 'z'},
   {GETOPT_HELP_OPTION_DECL},
@@ -191,7 +190,7 @@ Print selected parts of lines from each FILE to standard output.\n\
 "), stdout);
       fputs (_("\
   -s, --only-delimited    do not print lines not containing delimiters\n\
-      --output-delimiter=STRING  use STRING as the output delimiter\n\
+  -O, --output-delimiter=STRING  use STRING as the output delimiter\n\
                             the default is to use the input delimiter\n\
 "), stdout);
       fputs (_("\
@@ -675,7 +674,7 @@ main (int argc, char **argv)
   delim = '\0';
   have_read_stdin = false;
 
-  while ((optc = getopt_long (argc, argv, "b:c:d:Df:nsz", longopts, NULL)) != -1)
+  while ((optc = getopt_long (argc, argv, "b:c:d:Df:nO:sz", longopts, NULL)) != -1)
     {
       switch (optc)
         {
@@ -710,7 +709,7 @@ main (int argc, char **argv)
           delim_specified = true;
           break;
 
-        case OUTPUT_DELIMITER_OPTION:
+        case 'O':
           output_delimiter_specified = true;
           /* Interpret --output-delimiter='' to mean
              'use the NUL byte as the delimiter.'  */
-- 
2.30.2

From 7cdb369a51c5f716fd727760a18c220540a617d8 Mon Sep 17 00:00:00 2001
From: Assaf Gordon <assafgor...@gmail.com>
Date: Thu, 6 Jan 2022 15:36:43 -0700
Subject: [PATCH 08/18] cut: implement -F

---
 src/cut.c | 124 +++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 117 insertions(+), 7 deletions(-)

diff --git a/src/cut.c b/src/cut.c
index 4e86953d3..7da0c131f 100644
--- a/src/cut.c
+++ b/src/cut.c
@@ -32,10 +32,12 @@
 #include <sys/types.h>
 #include "system.h"
 
+#include "die.h"
 #include "error.h"
 #include "fadvise.h"
 #include "getndelim2.h"
 #include "hash.h"
+#include "regex.h"
 
 #include "set-fields.h"
 
@@ -128,6 +130,14 @@ static bool allow_duplicates;
    (which is always added by set_fields() */
 static struct field_range_pair *last_frp;
 
+/* With "-F", the input delimiter (-d) can be a regex string, not
+   just a single character. Keep the string here. */
+static char* delim_str;
+
+/* With "-F", this is the compiled regex */
+static bool delim_use_regex;
+static struct re_pattern_buffer delim_regex;
+
 /* For long options that have no equivalent short option, use a
    non-character as a pseudo short option, starting with CHAR_MAX + 1.  */
 enum
@@ -140,6 +150,7 @@ static struct option const longopts[] =
   {"bytes", required_argument, NULL, 'b'},
   {"characters", required_argument, NULL, 'c'},
   {"fields", required_argument, NULL, 'f'},
+  {"regex-fields", required_argument, NULL, 'F'},
   {"delimiter", required_argument, NULL, 'd'},
   {"allow-duplicates", required_argument, NULL, 'D'},
   {"only-delimited", no_argument, NULL, 's'},
@@ -183,6 +194,10 @@ Print selected parts of lines from each FILE to standard output.\n\
                             that contains no delimiter character, unless\n\
                             the -s option is specified\n\
   -n                      (ignored)\n\
+"), stdout);
+      fputs (_("\
+  -F, --regex-fields=LIST select only these fields; treat -d DELIM as a\n\
+ regular expression delimiter\n\
 "), stdout);
       fputs (_("\
       --complement        complement the set of selected bytes, characters\n\
@@ -200,7 +215,7 @@ Print selected parts of lines from each FILE to standard output.\n\
       fputs (VERSION_OPTION_DESCRIPTION, stdout);
       fputs (_("\
 \n\
-Use one, and only one of -b, -c or -f.  Each LIST is made up of one\n\
+Use one, and only one of -b, -c, -f or -F.  Each LIST is made up of one\n\
 range, or many ranges separated by commas.  Selected input is written\n\
 in the same order that it is read, and is written exactly once.\n\
 "), stdout);
@@ -451,11 +466,56 @@ cut_adv_fields (char* linebuf, size_t len)
 
   /* Split into fields */
   char *p = linebuf;
+  char *endp ;
   size_t l = len;
   idx_t fld = 0 ;
   while (true)
     {
-      char *endp = memchr (p, delim, l);
+      if (delim_use_regex)
+        {
+          #if 0
+          fprintf(stderr,"Running regex exec, beg = '%c'\n", *p);
+          #endif
+
+          struct re_registers regs;
+          memset (&regs, 0, sizeof regs);
+          regoff_t i = re_search (&delim_regex, p, l, 0, l, &regs);
+
+          if (i == -2)
+            FATAL_ERROR (_("regex search failed"));
+
+          #if 0
+          fprintf(stderr,"re_search returned %ld, num-reg = %zu\n", i, regs.num_regs);
+          for (int j=0;j<regs.num_regs;++j)
+            {
+              regoff_t s = regs.start[j];
+              regoff_t e = regs.end[j];
+              fprintf(stderr,"   reg[%ld].start = '%c'  end = '%c'\n", j, *(p+s), *(p+e)) ;
+            }
+          #endif
+
+          if (i >= 0)
+            {
+              /* The matched regex register is the location of the
+                 delimiting string.  Add NUL at the start (to
+                 terminate the preceeding field) and set ENDP to the
+                 end of it (one octet before the next field) */
+              const regoff_t s = regs.start[0];
+              const regoff_t e = regs.end[0];
+              *(p+s) = '\0';
+              endp = p+e-1;
+            }
+          else
+            {
+              endp = 0;
+            }
+          free (regs.start);
+          free (regs.end);
+        }
+      else
+        {
+         endp = memchr (p, delim, l);
+        }
 
       /* NUL-terminate the field if not the last */
       if (endp)
@@ -674,7 +734,7 @@ main (int argc, char **argv)
   delim = '\0';
   have_read_stdin = false;
 
-  while ((optc = getopt_long (argc, argv, "b:c:d:Df:nO:sz", longopts, NULL)) != -1)
+  while ((optc = getopt_long (argc, argv, "b:c:d:Df:F:nO:sz", longopts, NULL)) != -1)
     {
       switch (optc)
         {
@@ -695,6 +755,16 @@ main (int argc, char **argv)
           spec_list_string = optarg;
           break;
 
+        case 'F':
+          /* Build the field list. */
+          if (operating_mode != undefined_mode)
+            FATAL_ERROR (_("only one type of list may be specified"));
+          operating_mode = field_mode;
+          adv_mode = true;
+          spec_list_string = optarg;
+          delim_use_regex = true;
+          break;
+
         case 'D':
           adv_mode = true;
           allow_duplicates = true;
@@ -702,10 +772,7 @@ main (int argc, char **argv)
 
         case 'd':
           /* New delimiter. */
-          /* Interpret -d '' to mean 'use the NUL byte as the delimiter.'  */
-          if (optarg[0] != '\0' && optarg[1] != '\0')
-            FATAL_ERROR (_("the delimiter must be a single character"));
-          delim = optarg[0];
+          delim_str = xstrdup (optarg);
           delim_specified = true;
           break;
 
@@ -742,6 +809,49 @@ main (int argc, char **argv)
         }
     }
 
+  if (operating_mode == field_mode && delim_use_regex && !delim_specified)
+    {
+      /* Default delimiter for -F (regex delimiter) is whitespace */
+      delim_str = xstrdup("[ \t]+");
+      delim_specified = true;
+    }
+
+  /* '-d DELIM' validation */
+  if (delim_specified)
+    {
+      if (operating_mode == field_mode && delim_use_regex)
+        {
+          /* in -F/--regex-field mode, DELIM can be a non-empty string and
+             a valid regex. */
+          if (strlen (delim_str)==0)
+            FATAL_ERROR (_("delimiter string must not be empty with -F"));
+
+          /* FIXME: What are the correct flags compared to busybox/toybox? */
+          re_set_syntax (RE_SYNTAX_EGREP | RE_HAT_LISTS_NOT_NEWLINE | RE_ICASE);
+          memset (&delim_regex, 0, sizeof delim_regex);
+          const char *s = re_compile_pattern (delim_str, strlen (delim_str), &delim_regex);
+          if (s)
+            die (EXIT_FAILURE, 0, _("regex error: %s"), s);
+
+          /* Default output delimiter is one space */
+          if (!output_delimiter_specified)
+            {
+              output_delimiter_specified = true;
+              output_delimiter_string = xstrdup (" ");
+              output_delimiter_length = 1 ;
+            }
+
+        }
+      else
+        {
+          /* Interpret -d '' to mean 'use the NUL byte as the delimiter.'  */
+          if (delim_str[0] != '\0' && delim_str[1] != '\0')
+            FATAL_ERROR (_("the delimiter must be a single character"));
+          delim = delim_str[0];
+        }
+    }
+
+
   if (operating_mode == undefined_mode)
     FATAL_ERROR (_("you must specify a list of bytes, characters, or fields"));
 
-- 
2.30.2

From 84613ae0f584a5a13d0df9e8211df09d714b678d Mon Sep 17 00:00:00 2001
From: Assaf Gordon <assafgor...@gmail.com>
Date: Thu, 6 Jan 2022 15:48:07 -0700
Subject: [PATCH 09/18] tests: add 'cut -F' tests

---
 tests/misc/cut.pl | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/tests/misc/cut.pl b/tests/misc/cut.pl
index e644963f7..79ef49d40 100755
--- a/tests/misc/cut.pl
+++ b/tests/misc/cut.pl
@@ -317,6 +317,22 @@ my @Tests =
   ['DB-out-delim2', '-D -c1-3,2,5-', '--output-d=:', {IN=>"abcdefg\n"},
    {OUT=>"abc:b:efg\n"}],
 
+  ##
+  ## Check "-F" (regex delimiter)
+  ##
+  ['RE-1', "-d [0-9]+ -F3,1,2,1", {IN=>"abc123def456efg\n"},
+   {OUT=>"abc def efg\n"}],
+  ['RE-2', "-d [0-9]+ -F3,1,2,1 -D", {IN=>"abc123def456efg\n"},
+   {OUT=>"efg abc def abc\n"}],
+  ['RE-3', "-d [0-9]+ -F3,1,2,1 -D -O:", {IN=>"abc123def456efg\n"},
+   {OUT=>"efg:abc:def:abc\n"}],
+  ['RE-4', " -F3,1,2,1 -D", {IN=>"abc \t def\t\tefg\n"},
+   {OUT=>"efg abc def abc\n"}],
+  ['RE-5', " -F3,1,2,1 -D -O:", {IN=>"abc \t def\t\tefg\n"},
+   {OUT=>"efg:abc:def:abc\n"}],
+  ['RE-6', " -F3,1,2,1 -O:", {IN=>"abc \t def\t\tefg\n"},
+   {OUT=>"abc:def:efg\n"}],
+
  );
 
 if ($mb_locale ne 'C')
-- 
2.30.2

From 6dd655e72c648b8afa0f5affa5115d58ef19977a Mon Sep 17 00:00:00 2001
From: Assaf Gordon <assafgor...@gmail.com>
Date: Mon, 24 Jan 2022 21:11:21 -0700
Subject: [PATCH 10/18] cut: extract cut-fields into separate functions

---
 src/cut.c | 172 +++++++++++++++++++++++++++++++++---------------------
 1 file changed, 107 insertions(+), 65 deletions(-)

diff --git a/src/cut.c b/src/cut.c
index 7da0c131f..bed8f4cae 100644
--- a/src/cut.c
+++ b/src/cut.c
@@ -138,6 +138,17 @@ static char* delim_str;
 static bool delim_use_regex;
 static struct re_pattern_buffer delim_regex;
 
+struct field_pos
+{
+  char* pos;
+  size_t len;
+};
+static struct field_pos *field_pos;
+static idx_t field_alloc;
+static idx_t field_count;
+
+
+
 /* For long options that have no equivalent short option, use a
    non-character as a pseudo short option, starting with CHAR_MAX + 1.  */
 enum
@@ -458,94 +469,127 @@ cut_fields (FILE *stream)
     }
 }
 
-static bool
-cut_adv_fields (char* linebuf, size_t len)
+static void
+split_adv_fields_regex (char* linebuf, size_t len)
 {
-  static char **fieldpos = NULL ;
-  static idx_t alloc_flds = 0;
-
-  /* Split into fields */
   char *p = linebuf;
-  char *endp ;
   size_t l = len;
-  idx_t fld = 0 ;
-  while (true)
+  field_count = 0;
+
+  while (p)
     {
-      if (delim_use_regex)
-        {
-          #if 0
-          fprintf(stderr,"Running regex exec, beg = '%c'\n", *p);
-          #endif
+#if 0
+      fprintf(stderr,"Running regex exec, beg = '%c'\n", *p);
+#endif
 
-          struct re_registers regs;
-          memset (&regs, 0, sizeof regs);
-          regoff_t i = re_search (&delim_regex, p, l, 0, l, &regs);
+      struct re_registers regs;
+      memset (&regs, 0, sizeof regs);
+      regoff_t i = re_search (&delim_regex, p, l, 0, l, &regs);
 
-          if (i == -2)
-            FATAL_ERROR (_("regex search failed"));
+      if (i == -2)
+        FATAL_ERROR (_("regex search failed"));
 
-          #if 0
-          fprintf(stderr,"re_search returned %ld, num-reg = %zu\n", i, regs.num_regs);
-          for (int j=0;j<regs.num_regs;++j)
-            {
-              regoff_t s = regs.start[j];
-              regoff_t e = regs.end[j];
-              fprintf(stderr,"   reg[%ld].start = '%c'  end = '%c'\n", j, *(p+s), *(p+e)) ;
-            }
-          #endif
+#if 0
+      fprintf(stderr,"re_search returned %ld, num-reg = %zu\n", i, regs.num_regs);
+      for (int j=0;j<regs.num_regs;++j)
+        {
+          regoff_t s = regs.start[j];
+          regoff_t e = regs.end[j];
+          fprintf(stderr,"   reg[%ld].start = '%c'  end = '%c'\n", j, *(p+s), *(p+e)) ;
+        }
+#endif
 
-          if (i >= 0)
-            {
-              /* The matched regex register is the location of the
-                 delimiting string.  Add NUL at the start (to
-                 terminate the preceeding field) and set ENDP to the
-                 end of it (one octet before the next field) */
-              const regoff_t s = regs.start[0];
-              const regoff_t e = regs.end[0];
-              *(p+s) = '\0';
-              endp = p+e-1;
-            }
-          else
-            {
-              endp = 0;
-            }
-          free (regs.start);
-          free (regs.end);
+      /* Store this field */
+      if (field_count >= field_alloc)
+        field_pos = xpalloc (field_pos, &field_alloc, 10, -1, sizeof(struct field_pos));
+      field_pos[field_count].pos = p;
+
+      if (i >= 0)
+        {
+          /* The matched regex register is the location of the
+             delimiting string.  The current field ends where the
+             match starts, and the next field begins right after
+             the end of the match.  */
+          const regoff_t s = regs.start[0];
+          field_pos[field_count].len = s;
+
+          const regoff_t e = regs.end[0];
+          p += e;
+          l -= e;
         }
       else
         {
-         endp = memchr (p, delim, l);
+          field_pos[field_count].len = l;
+          p = NULL;
         }
 
-      /* NUL-terminate the field if not the last */
-      if (endp)
-        *endp = '\0';
+      free (regs.start);
+      free (regs.end);
+
+#if 0
+      fprintf(stderr,"(Regex)Field %ld: %d bytes = '%.*s'\n", field_count,
+              (int)field_pos[field_count].len, (int)field_pos[field_count].len,
+              field_pos[field_count].pos);
+#endif
+      field_count++;
+    }
+}
 
-      //fprintf(stderr,"Field %ld: '%s'\n", fld, p);
+static void
+split_adv_fields_delim (char* linebuf, size_t len)
+{
+  char *p = linebuf;
+  char *endp;
+  size_t l = len;
+  field_count = 0;
+
+  while (true)
+    {
+      endp = memchr (p, delim, l);
 
       /* Store this field */
-      if (fld >= alloc_flds)
-        fieldpos = xpalloc (fieldpos, &alloc_flds, 10, -1, sizeof(char*));
-      fieldpos[fld] = p;
-      fld++;
+      if (field_count >= field_alloc)
+        field_pos = xpalloc (field_pos, &field_alloc, 10, -1, sizeof(struct field_pos));
+      field_pos[field_count].pos = p;
+
+      idx_t fl = (endp) ? (endp-p) : l;
+      field_pos[field_count].len = fl ;
+
+#if 0
+      fprintf(stderr,"(Delim)Field %ld: %d bytes = '%.*s'\n", field_count,
+              (int)field_pos[field_count].len, (int)field_pos[field_count].len,
+              field_pos[field_count].pos);
+#endif
+      field_count++;
 
       if (!endp)
         break;
 
-      l -= (endp-p+1);
+      l -= fl + 1;
       p = endp+1;
     }
+}
+
+static bool
+cut_adv_fields (char* linebuf, size_t len)
+{
+  /* Split into fields */
+  if (delim_use_regex)
+    split_adv_fields_regex (linebuf, len);
+  else
+    split_adv_fields_delim (linebuf, len);
 
 
+  /* Print the fields */
   bool output = false;
 
-  if (fld>1)
+  if (field_count>1)
     {
       /* Iterate the requested field LIST, and print accordingly */
       for (struct field_range_pair* r = frp; r != last_frp ; ++r)
         {
           /* If open-ended range, print up to the available fields */
-          uintmax_t hi = (r->hi == UINTMAX_MAX) ? fld : r->hi;
+          uintmax_t hi = (r->hi == UINTMAX_MAX) ? field_count : r->hi;
 
           for (uintmax_t i = r->lo - 1 ; i < hi ; ++i )
             {
@@ -554,21 +598,22 @@ cut_adv_fields (char* linebuf, size_t len)
               fprintf(stderr,"Field %zu: '%s'\n", i, fieldpos[i-1]);
 #endif
 
-              if (i >=fld)
+              if (i >=field_count)
                 break;
 
               if (output)
                 fwrite (output_delimiter_string, sizeof (char),
                         output_delimiter_length, stdout);
 
-              fputs (fieldpos[i], stdout);
+              //fputs (fieldpos[i], stdout);
+              fwrite (field_pos[i].pos, sizeof (char), field_pos[i].len, stdout);
               output = true;
             }
         }
     }
 
   /* Print non-delimited lines */
-  if (!output && fld==1)
+  if (!output && field_count==1)
     {
       if (!suppress_non_delimited)
         {
@@ -577,11 +622,7 @@ cut_adv_fields (char* linebuf, size_t len)
         }
     }
 
-  IF_LINT (free (fieldpos));
-  IF_LINT (fieldpos = NULL);
-  IF_LINT (alloc_flds = 0);
-
-  return output || fld>1;
+  return output || field_count>1;
 }
 
 
@@ -911,6 +952,7 @@ main (int argc, char **argv)
     }
 
   IF_LINT (reset_fields ());
+  IF_LINT (free (field_pos));
 
   return ok ? EXIT_SUCCESS : EXIT_FAILURE;
 }
-- 
2.30.2

From 1c00839ff91d5f0315c7b2125f911ed50212c7db Mon Sep 17 00:00:00 2001
From: Assaf Gordon <assafgor...@gmail.com>
Date: Sat, 22 Jan 2022 09:21:50 -0700
Subject: [PATCH 11/18] cut: implement multibyte -c/--characters

---
 src/cut.c | 86 ++++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 79 insertions(+), 7 deletions(-)

diff --git a/src/cut.c b/src/cut.c
index bed8f4cae..459af3296 100644
--- a/src/cut.c
+++ b/src/cut.c
@@ -22,7 +22,7 @@
 
    Rewrite cut_fields and cut_bytes -- Jim Meyering.
 
-   Match toybox's -D,-F,-O options -- Assaf Gordon. */
+   Match toybox's -D,-F,-O options and add multibyte support -- Assaf Gordon. */
 
 #include <config.h>
 
@@ -37,6 +37,7 @@
 #include "fadvise.h"
 #include "getndelim2.h"
 #include "hash.h"
+#include "mbuiter.h"
 #include "regex.h"
 
 #include "set-fields.h"
@@ -85,7 +86,10 @@ enum operating_mode
     byte_mode,
 
     /* Output the given delimiter-separated fields. */
-    field_mode
+    field_mode,
+
+    /* Output multibyte characters. */
+    mbchar_mode,
   };
 
 static enum operating_mode operating_mode;
@@ -140,7 +144,7 @@ static struct re_pattern_buffer delim_regex;
 
 struct field_pos
 {
-  char* pos;
+  const char* pos;
   size_t len;
 };
 static struct field_pos *field_pos;
@@ -570,6 +574,29 @@ split_adv_fields_delim (char* linebuf, size_t len)
     }
 }
 
+static void
+split_adv_mbchars (char* linebuf, size_t len)
+{
+  field_count = 0;
+
+  mbui_iterator_t iter;
+  for (mbui_init (iter, linebuf); mbui_avail (iter); mbui_advance (iter))
+    {
+      /* Store this field */
+      if (field_count >= field_alloc)
+        field_pos = xpalloc (field_pos, &field_alloc, 10, -1, sizeof(struct field_pos));
+      field_pos[field_count].pos = mbui_cur_ptr (iter);
+      field_pos[field_count].len = mb_len (mbui_cur (iter));
+
+#if 0
+      fprintf(stderr,"(mbchar)char %ld: %d bytes = '%.*s'\n", field_count,
+              (int)field_pos[field_count].len, (int)field_pos[field_count].len,
+              field_pos[field_count].pos);
+#endif
+      field_count++;
+    }
+}
+
 static bool
 cut_adv_fields (char* linebuf, size_t len)
 {
@@ -655,6 +682,36 @@ cut_adv_bytes (char* linebuf, size_t len)
   return true;
 }
 
+static bool
+cut_adv_mbchars (char* linebuf, size_t len)
+{
+  bool output = false;
+
+  split_adv_mbchars (linebuf, len);
+
+  /* Iterate the requested field LIST, and print accordingly */
+  for (struct field_range_pair* r = frp; r != last_frp ; ++r)
+    {
+      /* If open-ended range, print up to the available fields */
+      uintmax_t hi = (r->hi == UINTMAX_MAX) ? len : r->hi;
+
+      if (output_delimiter_specified && output)
+        fwrite (output_delimiter_string, sizeof (char),
+                output_delimiter_length, stdout);
+
+      for (uintmax_t i = r->lo - 1 ; i < hi ; ++i )
+        {
+          if (i >=len)
+            break;
+
+          fwrite (field_pos[i].pos, sizeof (char), field_pos[i].len, stdout);
+          output = true;
+        }
+    }
+
+  return true;
+}
+
 static void
 cut_adv (FILE *stream)
 {
@@ -688,6 +745,8 @@ cut_adv (FILE *stream)
       bool output ;
       if (operating_mode == byte_mode)
         output = cut_adv_bytes (linebuf, len);
+      else if (operating_mode == mbchar_mode)
+        output = cut_adv_mbchars (linebuf, len);
       else
         output = cut_adv_fields (linebuf, len);
 
@@ -780,7 +839,6 @@ main (int argc, char **argv)
       switch (optc)
         {
         case 'b':
-        case 'c':
           /* Build the byte list. */
           if (operating_mode != undefined_mode)
             FATAL_ERROR (_("only one type of list may be specified"));
@@ -788,6 +846,14 @@ main (int argc, char **argv)
           spec_list_string = optarg;
           break;
 
+        case 'c':
+          /* Build the char list. */
+          if (operating_mode != undefined_mode)
+            FATAL_ERROR (_("only one type of list may be specified"));
+          operating_mode = mbchar_mode;
+          spec_list_string = optarg;
+          break;
+
         case 'f':
           /* Build the field list. */
           if (operating_mode != undefined_mode)
@@ -801,13 +867,11 @@ main (int argc, char **argv)
           if (operating_mode != undefined_mode)
             FATAL_ERROR (_("only one type of list may be specified"));
           operating_mode = field_mode;
-          adv_mode = true;
           spec_list_string = optarg;
           delim_use_regex = true;
           break;
 
         case 'D':
-          adv_mode = true;
           allow_duplicates = true;
           break;
 
@@ -900,13 +964,21 @@ main (int argc, char **argv)
     FATAL_ERROR (_("an input delimiter may be specified only\
  when operating on fields"));
 
-  if (adv_mode && complement)
+  if (allow_duplicates && complement)
     FATAL_ERROR (_("--complement cannot be used with -D"));
 
   /* -D implies -s with -f */
   if (allow_duplicates && operating_mode == field_mode)
     suppress_non_delimited = true;
 
+  /* -c in single-byte locale is equivalent to -b (which is faster) */
+  if ( (operating_mode == mbchar_mode) && (MB_CUR_MAX==1) )
+    operating_mode = byte_mode;
+
+  /* Use the more feature-rich (but slower) code path? */
+  adv_mode = allow_duplicates || delim_use_regex
+    || (operating_mode == mbchar_mode);
+
   if (suppress_non_delimited && operating_mode != field_mode)
     FATAL_ERROR (_("suppressing non-delimited lines makes sense\n\
 \tonly when operating on fields"));
-- 
2.30.2

From 55ec8cfeb59ffd892d89c0f754efbd8aa7eafb2e Mon Sep 17 00:00:00 2001
From: Assaf Gordon <assafgor...@gmail.com>
Date: Mon, 24 Jan 2022 22:57:33 -0700
Subject: [PATCH 12/18] cut: change -F regex syntax to BRE

---
 src/cut.c         | 6 +++---
 tests/misc/cut.pl | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/cut.c b/src/cut.c
index 459af3296..c32b9cd14 100644
--- a/src/cut.c
+++ b/src/cut.c
@@ -917,7 +917,7 @@ main (int argc, char **argv)
   if (operating_mode == field_mode && delim_use_regex && !delim_specified)
     {
       /* Default delimiter for -F (regex delimiter) is whitespace */
-      delim_str = xstrdup("[ \t]+");
+      delim_str = xstrdup("[ \t][ \t]*");
       delim_specified = true;
     }
 
@@ -931,8 +931,8 @@ main (int argc, char **argv)
           if (strlen (delim_str)==0)
             FATAL_ERROR (_("delimiter string must not be empty with -F"));
 
-          /* FIXME: What are the correct flags compared to busybox/toybox? */
-          re_set_syntax (RE_SYNTAX_EGREP | RE_HAT_LISTS_NOT_NEWLINE | RE_ICASE);
+          /* Regex syntax: match other coreutils programs (expr,nl,csplit) */
+          re_set_syntax (RE_SYNTAX_POSIX_BASIC & ~RE_CONTEXT_INVALID_DUP & ~RE_NO_EMPTY_RANGES);
           memset (&delim_regex, 0, sizeof delim_regex);
           const char *s = re_compile_pattern (delim_str, strlen (delim_str), &delim_regex);
           if (s)
diff --git a/tests/misc/cut.pl b/tests/misc/cut.pl
index 79ef49d40..3bff6e4a5 100755
--- a/tests/misc/cut.pl
+++ b/tests/misc/cut.pl
@@ -320,11 +320,11 @@ my @Tests =
   ##
   ## Check "-F" (regex delimiter)
   ##
-  ['RE-1', "-d [0-9]+ -F3,1,2,1", {IN=>"abc123def456efg\n"},
+  ['RE-1', "-d '[0-9][0-9]*' -F3,1,2,1", {IN=>"abc123def456efg\n"},
    {OUT=>"abc def efg\n"}],
-  ['RE-2', "-d [0-9]+ -F3,1,2,1 -D", {IN=>"abc123def456efg\n"},
+  ['RE-2', "-d '[0-9][0-9]*' -F3,1,2,1 -D", {IN=>"abc123def456efg\n"},
    {OUT=>"efg abc def abc\n"}],
-  ['RE-3', "-d [0-9]+ -F3,1,2,1 -D -O:", {IN=>"abc123def456efg\n"},
+  ['RE-3', "-d '[0-9][0-9]*' -F3,1,2,1 -D -O:", {IN=>"abc123def456efg\n"},
    {OUT=>"efg:abc:def:abc\n"}],
   ['RE-4', " -F3,1,2,1 -D", {IN=>"abc \t def\t\tefg\n"},
    {OUT=>"efg abc def abc\n"}],
-- 
2.30.2

From dbf9e94b7bb37fa6df65dd3af896f46b3f0ff9bc Mon Sep 17 00:00:00 2001
From: Assaf Gordon <assafgor...@gmail.com>
Date: Mon, 24 Jan 2022 22:59:10 -0700
Subject: [PATCH 13/18] cut: change -D long-option equivalent

---
 src/cut.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/cut.c b/src/cut.c
index c32b9cd14..7474b38fd 100644
--- a/src/cut.c
+++ b/src/cut.c
@@ -167,7 +167,7 @@ static struct option const longopts[] =
   {"fields", required_argument, NULL, 'f'},
   {"regex-fields", required_argument, NULL, 'F'},
   {"delimiter", required_argument, NULL, 'd'},
-  {"allow-duplicates", required_argument, NULL, 'D'},
+  {"definitive-list", required_argument, NULL, 'D'},
   {"only-delimited", no_argument, NULL, 's'},
   {"output-delimiter", required_argument, NULL, 'O'},
   {"complement", no_argument, NULL, COMPLEMENT_OPTION},
@@ -201,7 +201,7 @@ Print selected parts of lines from each FILE to standard output.\n\
   -d, --delimiter=DELIM   use DELIM instead of TAB for field delimiter\n\
 "), stdout);
       fputs (_("\
-  -D, --allow-duplicates  keep duplicated bytes/charaters/fields in LIST;\n\
+  -D, --definitive-list   keep duplicated bytes/charaters/fields in LIST;\n\
                             do not sort LIST; implies -s\n\
 "), stdout);
       fputs (_("\
-- 
2.30.2

From fc9e6bf9562c489ad6985a5cacd747a6bc11472d Mon Sep 17 00:00:00 2001
From: Assaf Gordon <assafgor...@gmail.com>
Date: Mon, 24 Jan 2022 23:14:18 -0700
Subject: [PATCH 14/18] doc: mention 'cut -D' in NEWS

---
 NEWS | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/NEWS b/NEWS
index b453f01ad..769a03fd5 100644
--- a/NEWS
+++ b/NEWS
@@ -2,6 +2,14 @@ GNU coreutils NEWS                                    -*- outline -*-
 
 * Noteworthy changes in release ?.? (????-??-??) [?]
 
+** New Features
+
+  cut now supports -D/--definitive-list option, to print the requested
+  bytes/characters/fields LIST exactly as-is, without sorting or removing
+  duplicates. That is, 'cut -D -b3,1,1' will print the 3rd byte followed by
+  the 1st byte, twice.
+
+
 ** Bug fixes
 
   chmod -R no longer exits with error status when encountering symlinks.
-- 
2.30.2

From 20f065765ba24af25de4caeffe25cd83e6831c4d Mon Sep 17 00:00:00 2001
From: Assaf Gordon <assafgor...@gmail.com>
Date: Mon, 24 Jan 2022 23:21:32 -0700
Subject: [PATCH 15/18] doc: mention 'cut -F' in NEWS

---
 NEWS | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/NEWS b/NEWS
index 769a03fd5..e90943d59 100644
--- a/NEWS
+++ b/NEWS
@@ -9,6 +9,11 @@ GNU coreutils NEWS                                    -*- outline -*-
   duplicates. That is, 'cut -D -b3,1,1' will print the 3rd byte followed by
   the 1st byte, twice.
 
+  cut now supports -F/--regex-fields=LIST to print fields in LIST (similar to
+  -f LIST), except that DELIM is treated as a regular expression.
+  That is, 'cut -d "[0-9]" -F2,5' will split input lines by regex delimiter
+  matching a single digit, then will print the 2nd and 5th fields.
+
 
 ** Bug fixes
 
-- 
2.30.2

From 7ddc0cf7a22a68f36e64b2a2c1d69e947e1d40fc Mon Sep 17 00:00:00 2001
From: Assaf Gordon <assafgor...@gmail.com>
Date: Mon, 24 Jan 2022 23:22:55 -0700
Subject: [PATCH 16/18] doc: mention 'cut -O' in NEWS

---
 NEWS | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/NEWS b/NEWS
index e90943d59..0decb53d3 100644
--- a/NEWS
+++ b/NEWS
@@ -75,6 +75,8 @@ GNU coreutils NEWS                                    -*- outline -*-
   now adjust /proc/$pid/cmdline to be more specific to the utility
   being run, rather than using the general "coreutils" binary name.
 
+  cut --output-delimiter now has a short-option equivalent: cut -O.
+
 
 * Noteworthy changes in release 9.0 (2021-09-24) [stable]
 
-- 
2.30.2

From db52925616d9dce82aff1abb9aae4da9f68e2f84 Mon Sep 17 00:00:00 2001
From: Assaf Gordon <assafgor...@gmail.com>
Date: Mon, 24 Jan 2022 23:31:17 -0700
Subject: [PATCH 17/18] doc: mention multibyte 'cut -c' in NEWS

---
 NEWS | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/NEWS b/NEWS
index 0decb53d3..df1ddfe8a 100644
--- a/NEWS
+++ b/NEWS
@@ -56,6 +56,13 @@ GNU coreutils NEWS                                    -*- outline -*-
   when the --foreground option is not specified.  This allows users to
   distinguish if the command was more forcefully terminated.
 
+  cut -c/--characters now supports multibyte locales (e.g. UTF-8). This could
+  lead to unexpected results, as prior to version 9.1 'cut -c' behaved
+  exactly the same as 'cut -b'.
+  That is: 'env printf "\U0001F600\n" | LC_ALL=en_CA.UTF-8 cut -c1' will now
+  print a unicode Grinning Face character (U+1F600) instead of 0xF0 (the
+  first byte of its UTF-8 encoding).
+
 ** Improvements
 
   cp now uses openat and similar syscalls when copying to a directory.
-- 
2.30.2

From 7f8b55c68acad824143f44fe029fbb09eecdca05 Mon Sep 17 00:00:00 2001
From: Assaf Gordon <assafgor...@gmail.com>
Date: Tue, 25 Jan 2022 00:48:35 -0700
Subject: [PATCH 18/18] doc: expand 'cut' section

---
 doc/coreutils.texi | 265 ++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 238 insertions(+), 27 deletions(-)

diff --git a/doc/coreutils.texi b/doc/coreutils.texi
index af87d4e6a..fd12a00cc 100644
--- a/doc/coreutils.texi
+++ b/doc/coreutils.texi
@@ -6160,12 +6160,12 @@ string between ranges of selected bytes.
 @opindex -c
 @opindex --characters
 Select for printing only the characters in positions listed in
-@var{character-list}.  The same as @option{-b} for now, but
-internationalization will change that.  Tabs and backspaces are
+@var{character-list}. Depending on the active locale, a single character
+might be composed of multiple bytes (e.g. in UTF-8). Tabs and backspaces are
 treated like any other character; they take up 1 character.  If an
 output delimiter is specified, (see the description of
 @option{--output-delimiter}), then output that string between ranges
-of selected bytes.
+of selected characters.
 
 @item -f @var{field-list}
 @itemx --fields=@var{field-list}
@@ -6176,36 +6176,47 @@ Fields are separated by a TAB character by default.  Also print any
 line that contains no delimiter character, unless the
 @option{--only-delimited} (@option{-s}) option is specified.
 
-Note @command{awk} supports more sophisticated field processing,
-like reordering fields, and handling fields aligned with blank characters.
-By default @command{awk} uses (and discards) runs of blank characters
-to separate fields, and ignores leading and trailing blanks.
-@example
-@verbatim
-awk '{print $2}'      # print the second field
-awk '{print $(NF-1)}' # print the penultimate field
-awk '{print $2,$1}'   # reorder the first two fields
-@end verbatim
-@end example
-Note while @command{cut} accepts field specifications in
-arbitrary order, output is always in the order encountered in the file.
+@item -F @var{field-list}
+@itemx --regex-fields=@var{field-list}
+@opindex -F
+@opindex --regex-fields
+Similar to @option{-f/--fields}, select for printing only the fields listed
+in @var{field-list}. Unlike @option{-f/--fields}, the delimiter is a regular
+expression (by default: whitespace, or custom regular expression with
+@option{-d DELIM}).
+The @option{-F} option implies @option{-s}.
 
-In the unlikely event that @command{awk} is unavailable,
-one can use the @command{join} command, to process blank
-characters as @command{awk} does above.
+
+@item -d @var{input_delim}
+@itemx --delimiter=@var{input_delim}
+@opindex -d
+@opindex --delimiter
+With @option{-f}, use the first byte of @var{input_delim} as
+the input fields separator (default is TAB).
+With @option{-F/--regex-fields}, treat @var{input_delim} as
+a basic regular expression (BRE) to separate fields.
+
+@item -D
+@itemx --definitive-list
+@opindex -D
+@opindex --definitive-list
+By default, @command{cut} sorts the field list and removes duplicated fields
+(from the @option{-b},@option{-c},@option{-f} options).
+With @option{-D}, @command{cut} will print the fields exactly as specified,
+unordered and including duplicates.
 @example
 @verbatim
-join -a1 -o 1.2     - /dev/null # print the second field
-join -a1 -o 1.2,1.1 - /dev/null # reorder the first two fields
+# print the 1st character, followed by the 3rd character.
+$ printf "abc\n" | cut -c 3,1,1
+ac
+
+# print the 3rd character, followed by the 1st character, twice.
+$ printf "abc\n" | cut -D -c 3,1,1
+caa
 @end verbatim
 @end example
 
-@item -d @var{input_delim_byte}
-@itemx --delimiter=@var{input_delim_byte}
-@opindex -d
-@opindex --delimiter
-With @option{-f}, use the first byte of @var{input_delim_byte} as
-the input fields separator (default is TAB).
+
 
 @item -n
 @opindex -n
@@ -6218,7 +6229,9 @@ Do not split multi-byte characters (no-op for now).
 For @option{-f}, do not print lines that do not contain the field separator
 character.  Normally, any line without a field separator is printed verbatim.
 
+@item -O @var{output_delim_string}
 @item --output-delimiter=@var{output_delim_string}
+@opindex -O
 @opindex --output-delimiter
 With @option{-f}, output fields are separated by @var{output_delim_string}.
 The default with @option{-f} is to use the input delimiter.
@@ -6235,6 +6248,7 @@ selected with the @option{-b}, @option{-c} or @option{-f} options.
 In other words, do @emph{not} print the bytes, characters or fields
 specified via those options.  This option is useful when you have
 many fields and want to print all but a few of them.
+@option{--complement} cannot be used with @option{-D}.
 
 @optZeroTerminated
 
@@ -6243,6 +6257,203 @@ many fields and want to print all but a few of them.
 @exitstatus
 
 
+@menu
+* cutting fields::            advanced @command{cut} usage.
+* output delimiter::          controlling output delimiter.
+* awk as alternative to cut:: sophisticated field processing.
+@end menu
+
+@node cutting fields
+@subsection advanced @command{cut} usage
+
+Traditionally, @command{cut} was limited to printing each output field once,
+and in strict order. Additionally, @command{cut} only allowed a single character
+as field delimiter.
+
+Examples:
+
+@exdent Output fields printed in order and without duplicates:
+@example
+@verbatim
+$ printf "a:b:c:d\n" | cut -d: -f1,3,3,1
+a:c
+@end verbatim
+@end example
+
+@exdent Input field delimiter was limited to a single character (TAB by default).
+In the following examples, the default delimiter (TAB) is used, and the space
+character is treated as part of the 3rd field:
+
+@example
+@verbatim
+$ printf "abc\tdef\tghi jkl\n" | cut -f3,1,1
+abc     ghi jkl
+$ printf "abc\tdef\tghi jkl\n" | cut -f1,3,3,1
+abc     ghi jkl
+@end verbatim
+@end example
+
+For this reason, it was commonly recommended to use @command{awk} for
+more sophisticated field processing (@pxref{awk as alternative to cut}).
+
+Starting with GNU @command{cut} version 9.1, @command{cut} supports the
+@option{-D/--definitive-list} option, which outputs the fields exactly as
+specified by the user:
+
+@example
+@verbatim
+$ printf "a:b:c:d\n" | cut -D -d: -f1,3,3,1
+a:c:c:a
+@end verbatim
+@end example
+
+@command{cut} also supports @option{-F/--regex-fields}, which treats the
+input field delimiter as a regular expression instead of a single character.
+By default, whitespace (one or more TAB and/or space characters) is treated as
+the field delimiter:
+
+@example
+@verbatim
+$ printf "abc\tdef\tghi jkl\n" | cut --fields=1,3
+abc     ghi jkl
+
+$ printf "abc\tdef\tghi jkl\n" | cut --regex-fields=1,3
+abc ghi
+@end verbatim
+@end example
+
+Combine @option{-d} with @option{-F} to specify the regular expression to use
+as field delimiter instead of whitespace.
+In the following example, the regex delimiter is 'any character except digits':
+
+@example
+@verbatim
+$ printf "403-555.9999" | cut -d '[^0-9]' -F1
+403
+@end verbatim
+@end example
+
+
+With @option{-D} and @option{-F}, @command{cut} can now split input fields by
+whitespace and print them in any order, much like @command{awk}:
+
+@example
+@verbatim
+$ printf "World   Coreutils \t  Hello\n" | cut -D -F3,1
+Hello World
+
+$ printf "World   Coreutils \t  Hello\n" | awk '{print $3, $1}'
+Hello World
+@end verbatim
+@end example
+
+@node output delimiter
+@subsection Controlling the output delimiter
+
+The default output field delimiter is TAB for @option{-f} and none for
+@option{-b} and @option{-c}:
+
+@example
+@verbatim
+$ printf "a\tb\tc\n" | cut -f2,3
+b       c
+
+$ printf "abc\n" | cut -b2,3
+bc
+@end verbatim
+@end example
+
+Changing the input field delimiter with @option{-d} changes the output
+field delimiter as well (note that specifying @option{-d} with @option{-b}
+or @option{-c} is meaningless and will result in an error):
+
+@example
+@verbatim
+$ printf "a:b:c\n" | cut -d: -f2,3
+b:c
+@end verbatim
+@end example
+
+The @option{-O/--output-delimiter} option can be used to specify an output
+delimiter that differs from the input delimiter (or to add an output delimiter
+when using @option{-b} or @option{-c}):
+
+@example
+@verbatim
+$ printf "a:b:c\n" | cut -d: -f2,3 --output-delimiter=%
+b%c
+
+$ printf "abcde\n" | cut -b1,2,4 -O%
+a%b%d
+@end verbatim
+@end example
+
+The @option{-O/--output-delimiter} option accepts a string, not just a single character:
+
+@example
+@verbatim
+$ printf "abcde\n" | cut -b1,2,4 -O"-1234-"
+a-1234-b-1234-d
+@end verbatim
+@end example
+
+As a special case, when @option{-O/--output-delimiter} is used with @option{-b}
+or @option{-c}, the output delimiter will be printed @emph{only} for fields
+separated by a comma in the field @var{LIST}:
+
+@example
+@verbatim
+$ printf "1234567890\n" | cut -b1-5 -O:
+12345
+
+$ printf "1234567890\n" | cut -b1,2,3,4,5 -O:
+1:2:3:4:5
+
+$ printf "1234567890\n" | cut -b1-3,4-5 -O:
+123:45
+@end verbatim
+@end example
+
+
+
+
+@node awk as alternative to cut
+@subsection @command{awk} as alternative to @command{cut}
+
+This section previously pointed to @command{awk} as an alternative
+to using @command{cut} when more sophisticated field processing was needed.
+
+Starting with GNU @command{cut} version 9.1, @command{cut} supports
+advanced field processing options @option{-F/--regex-fields} and
+@option{-D/--definitive-list}. These options suffice in many cases instead
+of using @command{awk}.
+The text below is kept for reference:
+
+Note @command{awk} supports more sophisticated field processing,
+like reordering fields, and handling fields aligned with blank characters.
+By default @command{awk} uses (and discards) runs of blank characters
+to separate fields, and ignores leading and trailing blanks.
+@example
+@verbatim
+awk '{print $2}'      # print the second field
+awk '{print $(NF-1)}' # print the penultimate field
+awk '{print $2,$1}'   # reorder the first two fields
+@end verbatim
+@end example
+Note while @command{cut} accepts field specifications in
+arbitrary order, output is always in the order encountered in the file.
+
+In the unlikely event that @command{awk} is unavailable,
+one can use the @command{join} command, to process blank
+characters as @command{awk} does above.
+@example
+@verbatim
+join -a1 -o 1.2     - /dev/null # print the second field
+join -a1 -o 1.2,1.1 - /dev/null # reorder the first two fields
+@end verbatim
+@end example
+
+
 @node paste invocation
 @section @command{paste}: Merge lines of files
 
-- 
2.30.2

Re: cut -DF

Reply via email to