Aharon Robbins wrote:
that exclusive-ORing looks a little scary to me.

It is derived from a coding hack that I picked up from Dijkstra back in the 1970s. He gave the most boring computer-science lecture I have ever attended -- so boring that Kit Fine walked out a few minutes into it -- but I stubbornly stayed through to the end and I've never forgotten the hack.

The hack works everywhere, including platforms that use EBCDIC, shift-JIS, DBCS, etc., because it doesn't rely on the encoding scheme at all. Attached is a revised patch that adds some commentary and breaks the hack into some functions that I hope help explain things. Sorry, I don't know how to break this into smaller patches that would be easier to understand.
>From 859b9496860e67d01e32a58e9f2a098410775a22 Mon Sep 17 00:00:00 2001
From: Paul Eggert <[email protected]>
Date: Tue, 28 Jan 2014 13:47:47 -0800
Subject: [PATCH] Simplify handling of letter case.

* src/dfa.c (setbit_wc, setbit_case_fold_c, atom): Simplify.
(xor_other, xor_wother, to_other, to_wother): New functions.
(setbit_case_fold_c, parse_bracket_exp, lex, atom): Use them
to invoke tolower and toupper instead of isalpha followed by one or
the other, and similarly for towlower, towupper, iswalpha.  This should
lead to more-efficient handling of caseless letters, and it simplifies
the code.
---
 src/dfa.c | 130 +++++++++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 81 insertions(+), 49 deletions(-)

diff --git a/src/dfa.c b/src/dfa.c
index b79c604..af830a8 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -662,6 +662,42 @@ wchar_context (wint_t wc)
   return CTX_NONE;
 }
 
+/* The following functions exploit the commutativity and associativity of ^,
+   and the fact that X ^ X is zero.  POSIX requires that C equals
+   either tolower (C) or toupper (C); if the former, then C ^ tolower (C)
+   is zero so C ^ xor_other (C) equals toupper (C), and similarly
+   for the latter.  */
+
+/* Return the exclusive-OR of C and C's other case, or zero if C is
+   not a letter that changes case.  */
+
+static int
+xor_other (int c)
+{
+  return tolower (c) ^ toupper (c);
+}
+
+static wint_t
+xor_wother (wint_t c)
+{
+  return towlower (c) ^ towupper (c);
+}
+
+/* If C is a lowercase letter, return its uppercase version, and vice versa.
+   Return C if it's not a letter that changes case.  */
+
+static int
+to_other (int c)
+{
+  return c ^ xor_other (c);
+}
+
+static wint_t
+to_wother (wint_t c)
+{
+  return c ^ xor_wother (c);
+}
+
 /* Entry point to set syntax options.  */
 void
 dfasyntax (reg_syntax_t bits, int fold, unsigned char eol)
@@ -693,39 +729,24 @@ dfasyntax (reg_syntax_t bits, int fold, unsigned char eol)
    this may happen when folding case in weird Turkish locales where
    dotless i/dotted I are not included in the chosen character set.
    Return whether a bit was set in the charclass.  */
-#if MBS_SUPPORT
 static bool
 setbit_wc (wint_t wc, charclass c)
 {
+#if MBS_SUPPORT
   int b = wctob (wc);
   if (b == EOF)
     return false;
 
   setbit (b, c);
   return true;
-}
-
-/* Set a bit in the charclass for the given single byte character,
-   if it is valid in the current character set.  */
-static void
-setbit_c (int b, charclass c)
-{
-  /* Do nothing if b is invalid in this character set.  */
-  if (MB_CUR_MAX > 1 && btowc (b) == WEOF)
-    return;
-  setbit (b, c);
-}
 #else
-# define setbit_c setbit
-static inline bool
-setbit_wc (wint_t wc, charclass c)
-{
   abort ();
    /*NOTREACHED*/ return false;
-}
 #endif
+}
 
-/* Like setbit_c, but if case is folded, set both cases of a letter.  For
+/* Like setbit_wc but for a single-byte character B; and if case is
+   folded, set both cases of a letter.  For
    MB_CUR_MAX > 1, the resulting charset is only used as an optimization,
    and the caller takes care of setting the appropriate field of struct
    mb_char_classes.  */
@@ -737,16 +758,16 @@ setbit_case_fold_c (int b, charclass c)
       wint_t wc = btowc (b);
       if (wc == WEOF)
         return;
-      setbit (b, c);
-      if (case_fold && iswalpha (wc))
-        setbit_wc (iswupper (wc) ? towlower (wc) : towupper (wc), c);
+      if (case_fold)
+        setbit_wc (to_wother (wc), c);
     }
   else
     {
-      setbit (b, c);
-      if (case_fold && isalpha (b))
-        setbit_c (isupper (b) ? tolower (b) : toupper (b), c);
+      if (case_fold)
+        setbit (to_other (b), c);
     }
+
+  setbit (b, c);
 }
 
 
@@ -1085,23 +1106,30 @@ parse_bracket_exp (void)
             {
               /* When case folding map a range, say [m-z] (or even [M-z])
                  to the pair of ranges, [m-z] [M-Z].  */
+              wchar_t lo1 = wc, hi1 = wc2, lo2 = wc, hi2 = wc2;
+              if (case_fold)
+                {
+                  lo1 = towlower (lo1);
+                  hi1 = towlower (hi1);
+                  lo2 = towupper (lo2);
+                  hi2 = towupper (hi2);
+                }
+
               REALLOC_IF_NECESSARY (work_mbc->range_sts,
                                     range_sts_al, work_mbc->nranges + 1);
               REALLOC_IF_NECESSARY (work_mbc->range_ends,
                                     range_ends_al, work_mbc->nranges + 1);
-              work_mbc->range_sts[work_mbc->nranges] =
-                case_fold ? towlower (wc) : (wchar_t) wc;
-              work_mbc->range_ends[work_mbc->nranges++] =
-                case_fold ? towlower (wc2) : (wchar_t) wc2;
+              work_mbc->range_sts[work_mbc->nranges] = lo1;
+              work_mbc->range_ends[work_mbc->nranges++] = hi1;
 
-              if (case_fold && (iswalpha (wc) || iswalpha (wc2)))
+              if (lo1 != lo2 || hi1 != hi2)
                 {
                   REALLOC_IF_NECESSARY (work_mbc->range_sts,
                                         range_sts_al, work_mbc->nranges + 1);
-                  work_mbc->range_sts[work_mbc->nranges] = towupper (wc);
+                  work_mbc->range_sts[work_mbc->nranges] = lo2;
                   REALLOC_IF_NECESSARY (work_mbc->range_ends,
                                         range_ends_al, work_mbc->nranges + 1);
-                  work_mbc->range_ends[work_mbc->nranges++] = towupper (wc2);
+                  work_mbc->range_ends[work_mbc->nranges++] = hi2;
                 }
             }
           else
@@ -1129,16 +1157,19 @@ parse_bracket_exp (void)
           continue;
         }
 
-      if (case_fold && iswalpha (wc))
+      if (case_fold)
         {
-          wc = towlower (wc);
-          if (!setbit_wc (wc, ccl))
+          wchar_t xor = xor_wother (wc);
+          if (xor)
             {
-              REALLOC_IF_NECESSARY (work_mbc->chars, chars_al,
-                                    work_mbc->nchars + 1);
-              work_mbc->chars[work_mbc->nchars++] = wc;
+              wchar_t other = wc ^ xor;
+              if (!setbit_wc (other, ccl))
+                {
+                  REALLOC_IF_NECESSARY (work_mbc->chars, chars_al,
+                                        work_mbc->nchars + 1);
+                  work_mbc->chars[work_mbc->nchars++] = other;
+                }
             }
-          wc = towupper (wc);
         }
       if (!setbit_wc (wc, ccl))
         {
@@ -1481,7 +1512,7 @@ lex (void)
           if (MB_CUR_MAX > 1)
             return lasttok = WCHAR;
 
-          if (case_fold && isalpha (c))
+          if (case_fold && tolower (c) != toupper (c))
             {
               zeroset (ccl);
               setbit_case_fold_c (c, ccl);
@@ -1725,17 +1756,18 @@ add_utf8_anychar (void)
 static void
 atom (void)
 {
-  if (0)
-    {
-      /* empty */
-    }
-  else if (MBS_SUPPORT && tok == WCHAR)
+  if (MBS_SUPPORT && tok == WCHAR)
     {
-      addtok_wc (case_fold ? towlower (wctok) : wctok);
-      if (case_fold && iswalpha (wctok))
+      wchar_t wc = wctok;
+      addtok_wc (wc);
+      if (case_fold)
         {
-          addtok_wc (towupper (wctok));
-          addtok (OR);
+          wchar_t xor = xor_wother (wc);
+          if (xor)
+            {
+              addtok_wc (wc ^ xor);
+              addtok (OR);
+            }
         }
 
       tok = lex ();
-- 
1.8.5.3

Reply via email to