[Implemented] [coreutils] Partial UTF-8 support for "cut -c"

jaime.mosquera Mon, 12 Aug 2019 13:13:31 -0700

Good evening.

I have partially implemented the option "-c" ("--characters") for UTF-8 
non-ASCII characters, so that using a text in any language other than English 
does not result in rather subtle bugs ("cut -c 1-79" produces 79 characters, 
except that lines with one accented letter are one character short; 
furthermore, depending on where you cut, you may get "partial", unprintable 
characters). My modifications are attached as a patch file (created through 
git) to the last version found on GitHub (as cloned earlier today).


This implementation has two somewhat important shortcomings:

* Other encodings are not implemented. UTF-16 should not be too difficult to 
implement, and UTF-32 would be easier still, but branching between them 
would make the code a bit more difficult to understand and would require a 
simple way to detect the current encoding and act accordingly. Furthermore, 
more encodings would be needed (Japan still uses non-Unicode encodings with 
some frequency), so I decided to stick with just UTF-8.

* Modifier (combining) characters are treated as individual characters, instead 
of being processed along with the characters they modify, as Unicode dictates. 
Consequently, text in many Western European languages (Spanish, Portuguese...) 
might or might not work with this program, depending on whether its accented 
letters are stored as single precomposed characters or as a base letter plus a 
combining mark (on my computer it worked perfectly).

On the other hand, missing bytes in multibyte UTF-8 characters are correctly 
handled (the incomplete character is printed, but the next character is read 
whole, without misreading any of its bytes as part of the previous character).

It is my hope that you will find this first approach to the problem 
sufficient for most uses, and incorporate it into the mainline code.

Greetings.

(Should my modifications be big enough to require it for copyright reasons, my 
name is "Jaime Mosquera", and I obviously agree to the terms of the GNU GPL.)

diff --git a/src/cut.c b/src/cut.c
index bb2e641f7..8f156ad78 100644
--- a/src/cut.c
+++ b/src/cut.c
@@ -80,6 +80,9 @@ enum operating_mode
     /* Output characters that are in the given bytes. */
     byte_mode,
 
+    /* Output characters that are in the given characters. */
+    char_mode,
+
     /* Output the given delimiter-separated fields. */
     field_mode
   };
@@ -137,6 +140,40 @@ static struct option const longopts[] =
   {NULL, 0, NULL, 0}
 };
 
+/* Read one UTF-8 sequence from STREAM and return its bytes packed into an int, or EOF.  */
+static
+int getUTF8 (FILE* stream)
+{
+  int c, ch;
+  int n, i;	/* N is the number of continuation bytes expected after the lead byte. */
+
+  c = getc (stream);
+  if (c == EOF)
+    return c;
+  if ((c >> 5) == 6)		/* 110xxxxx: lead byte of a 2-byte sequence. */
+    n = 1;
+  else if ((c >> 4) == 14)	/* 1110xxxx: lead byte of a 3-byte sequence. */
+    n = 2;
+  else if ((c >> 3) == 30)	/* 11110xxx: lead byte of a 4-byte sequence. */
+    n = 3;
+  else				/* Any other byte is returned on its own. */
+    n = 0;
+
+  for (i = 0; i < n; i++)
+  {
+    ch = getc (stream);
+    if ((ch >> 6) == 2)	/* 10xxxxxx: continuation byte; append it below the bytes read so far. */
+      c = (c << 8) + ch;	/* NOTE(review): for a 4-byte sequence the last shift exceeds INT_MAX when int is 32 bits. */
+    else
+    {
+      ungetc (ch, stream);	/* Not a continuation byte: push it back and return the truncated sequence. */
+      break;
+    }
+  }
+
+  return c;
+}
+
 void
 usage (int status)
 {
@@ -280,6 +317,71 @@ cut_bytes (FILE *stream)
     }
 }
 
+
+/* Read from stream STREAM, printing to standard output any selected characters.  */
+
+static void
+cut_chars (FILE *stream)
+{
+  uintmax_t char_idx;	/* Number of characters in the line so far. */
+  /* Whether to begin printing delimiters between ranges for the current line.
+     Set after we've begun printing data corresponding to the first range.  */
+  bool print_delimiter;
+
+  char_idx = 0;
+  print_delimiter = false;
+  current_rp = frp;
+  while (true)
+    {
+      int c;		/* Each character from the file, as returned by getUTF8. */
+      unsigned int ch;	/* Unsigned copy of C, consumed while splitting it into bytes. */
+      int i;
+      char str[5];	/* The four bytes of CH, most significant first, plus a terminator. */
+
+      c = getUTF8 (stream);
+      /* C now holds one whole byte sequence, first byte most significant.  */
+
+      if (c == line_delim)
+        {
+          /* End of line: echo the delimiter and reset the per-line state.  */
+          putchar (c);
+          char_idx = 0;
+          print_delimiter = false;
+          current_rp = frp;
+        }
+      else if (c == EOF)
+        {
+          /* Terminate a final line that did not end with the delimiter.  */
+          if (char_idx > 0)
+            putchar (line_delim);
+          break;
+        }
+      else
+        {
+          ch = *(unsigned int*) &c;	/* Reinterpret C as unsigned for the byte split below. */
+          next_item (&char_idx);
+          if (print_kth (char_idx))
+            {
+              if (output_delimiter_specified)
+                {
+                  if (print_delimiter && is_range_start_index (char_idx))
+                    {
+                      fwrite (output_delimiter_string, sizeof (char),
+                              output_delimiter_length, stdout);
+                    }
+                  print_delimiter = true;
+                }
+	      /* Split CH into its four bytes, most significant byte in str[0].  */
+	      for (i = 3; i >= 0; i--, ch /= 256)
+		str[i] = ch % 256;
+	      str[4] = 0;
+	      /* NOTE(review): zero bytes are skipped, so a selected NUL character is never printed.  */
+	      for (i = 0; i < 4; i++)
+		if (str[i] != 0)
+                  putchar ((unsigned char) str[i]);
+            }
+        }
+    }
+}
+
 /* Read from stream STREAM, printing to standard output any selected fields.  */
 
 static void
@@ -430,6 +532,8 @@ cut_stream (FILE *stream)
 {
   if (operating_mode == byte_mode)
     cut_bytes (stream);
+  else if (operating_mode == char_mode)
+    cut_chars (stream);
   else
     cut_fields (stream);
 }
@@ -505,7 +609,6 @@ main (int argc, char **argv)
       switch (optc)
         {
         case 'b':
-        case 'c':
           /* Build the byte list. */
           if (operating_mode != undefined_mode)
             FATAL_ERROR (_("only one type of list may be specified"));
@@ -513,6 +616,14 @@ main (int argc, char **argv)
           spec_list_string = optarg;
           break;
 
+        case 'c':
+          /* Build the char list. */
+          if (operating_mode != undefined_mode)
+            FATAL_ERROR (_("only one type of list may be specified"));
+          operating_mode = char_mode;
+          spec_list_string = optarg;
+          break;
+
         case 'f':
           /* Build the field list. */
           if (operating_mode != undefined_mode)

[Implemented] [coreutils] Partial UTF-8 support for "cut -c"

Reply via email to