commit 7d3e9c6e88474effad146d77c7d4d819d3a6f75d
Author: FRIGN <[email protected]>
Date:   Thu Jan 15 11:51:58 2015 +0100

    Resolve escape characters in tr(1)
    
    This is one aspect which I think has blown up the complexity of many
    tr-implementations around today.
    Instead of complicating the set-theory-based parser itself (it should
    still be relying on one rune per char, not multirunes), I added a
    preprocessor, which basically scans the rune array for upcoming '\'s,
    reads what it finds, substitutes the real character at the '\'s index
    and shifts the entire following array so there are no "holes".
    
    What is left to reflect on is what to do with octal sequences.
    I have a local implementation here, which works fine, but imho,
    given tr is already so focused on UTF-8, we might as well ignore
    POSIX at this point and rather implement the Unicode UTF-8 code points,
    which are way more contemporary and future-proof.
    
    Reading in \uC3A4 as an array of 0xC3 and 0xA4 is not the issue,
    but I'm still struggling to find a way to turn it into a well-formed
    byte sequence. Hit me with a mail if you have a simple solution for
    that.

diff --git a/tr.c b/tr.c
index d2bf7e0..442fb04 100644
--- a/tr.c
+++ b/tr.c
@@ -70,6 +70,36 @@ rstrmatch(Rune *r, char *s, size_t n)
 }
 
 static size_t
+resolveescapes(Rune *r, size_t len)
+{
+       size_t i, off, m;
+
+       /* Resolve \\ \a \b \f \n \r \t \v in r in place; returns the new length. */
+       for (i = 0; i + 1 < len; i++) { /* i < len - 1 would wrap for len == 0 */
+               if (r[i] != '\\')
+                       continue;
+               off = 0; /* number of runes consumed after the backslash */
+               switch (r[i + 1]) {
+               case '\\': r[i] = '\\'; off++; break;
+               case 'a':  r[i] = '\a'; off++; break;
+               case 'b':  r[i] = '\b'; off++; break;
+               case 'f':  r[i] = '\f'; off++; break;
+               case 'n':  r[i] = '\n'; off++; break;
+               case 'r':  r[i] = '\r'; off++; break;
+               case 't':  r[i] = '\t'; off++; break;
+               case 'v':  r[i] = '\v'; off++; break;
+               default:   continue; /* unknown escape: both runes are kept */
+               }
+               /* Close the hole; reads r[len] (presumably a terminator, confirm). */
+               for (m = i + 1; m <= len - off; m++)
+                       r[m] = r[m + off];
+               len -= off;
+       }
+
+       return len;
+}
+
+static size_t
 makeset(char *str, struct range **set, int (**check)(wint_t))
 {
        Rune  *rstr;
@@ -79,9 +109,9 @@ makeset(char *str, struct range **set, int (**check)(wint_t))
 
        /* rstr defines at most len ranges */
        len = chartorunearr(str, &rstr);
+       len = resolveescapes(rstr, len);
        *set = emalloc(len * sizeof(**set));
 
-       /* todo: allow expressions */
        for (i = 0; i < len; i++) {
                if (rstr[i] == '[') {
                        j = i;

Reply via email to