bug#16911: [PATCH] grep: fix bugs with -i and titlecase

Paul Eggert Fri, 28 Feb 2014 22:55:17 -0800

Tags: patch

The attached patch, which I've pushed, fixes a problem with grep -i and titlecase that's been bugging me ever since someone pointed out some titlecase issues on the grep mailing list a few weeks ago. It affects dfa.c, so I expect it'll fix a similar problem with gawk.

From f1a3831c32850859cd5faddb1749c095a89a2a84 Mon Sep 17 00:00:00 2001
From: Paul Eggert <[email protected]>
Date: Fri, 28 Feb 2014 22:46:02 -0800
Subject: [PATCH] grep: fix bugs with -i and titlecase


* NEWS: Document this.
* src/dfa.c (setbit_wc): Simplify.
(setbit_c): Remove; no longer used.
(setbit_case_fold_c, parse_bracket_exp, atom):
Don't mishandle titlecase.  For 'atom', this removes the need for
the refactoring of Bug#16729.
(lex): Use the slower approach only for letters that have a
differing case.
* tests/case-fold-titlecase: New file.
* tests/Makefile.am (TESTS): Add it.
---
 NEWS                      |   5 ++
 src/dfa.c                 | 159 +++++++++++++++++++++++-----------------------
 tests/Makefile.am         |   1 +
 tests/case-fold-titlecase |  41 ++++++++++++
 4 files changed, 127 insertions(+), 79 deletions(-)
 create mode 100755 tests/case-fold-titlecase

diff --git a/NEWS b/NEWS
index 6cfcaba..4b1364c 100644
--- a/NEWS
+++ b/NEWS
@@ -19,6 +19,11 @@ GNU grep NEWS                                    -*- outline -*-
   echo a@@a| grep -w @@ would not.  Now, they both fail to match,
   per the documentation on how grep's -w works.
 
+  grep -i no longer mishandles patterns containing titlecase characters.
+  For example, in a locale containing the titlecase character
+  'ǈ' (U+01C8 LATIN CAPITAL LETTER L WITH SMALL LETTER J),
+  'grep -i ǈ' now matches 'Ǉ' (U+01C7 LATIN CAPITAL LETTER LJ).
+
 
 * Noteworthy changes in release 2.18 (2014-02-20) [stable]
 
diff --git a/src/dfa.c b/src/dfa.c
index 4708895..b3d9da8 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -694,42 +694,27 @@ dfasyntax (reg_syntax_t bits, int fold, unsigned char eol)
    this may happen when folding case in weird Turkish locales where
    dotless i/dotted I are not included in the chosen character set.
    Return whether a bit was set in the charclass.  */
-#if MBS_SUPPORT
 static bool
 setbit_wc (wint_t wc, charclass c)
 {
+#if MBS_SUPPORT
   int b = wctob (wc);
   if (b == EOF)
     return false;
 
   setbit (b, c);
   return true;
-}
-
-/* Set a bit in the charclass for the given single byte character,
-   if it is valid in the current character set.  */
-static void
-setbit_c (int b, charclass c)
-{
-  /* Do nothing if b is invalid in this character set.  */
-  if (MB_CUR_MAX > 1 && btowc (b) == WEOF)
-    return;
-  setbit (b, c);
-}
 #else
-# define setbit_c setbit
-static inline bool
-setbit_wc (wint_t wc, charclass c)
-{
   abort ();
    /*NOTREACHED*/ return false;
-}
 #endif
+}
 
-/* Like setbit_c, but if case is folded, set both cases of a letter.  For
-   MB_CUR_MAX > 1, the resulting charset is only used as an optimization,
-   and the caller takes care of setting the appropriate field of struct
-   mb_char_classes.  */
+/* Set a bit for B in the charclass C, if B is a valid single byte
+   character in the current character set.  If case is folded, set B's
+   lower and upper case variants similarly.  If MB_CUR_MAX > 1, the
+   resulting charset is used only as an optimization, and the caller
+   should set the appropriate field of struct mb_char_classes.  */
 static void
 setbit_case_fold_c (int b, charclass c)
 {
@@ -738,16 +723,21 @@ setbit_case_fold_c (int b, charclass c)
       wint_t wc = btowc (b);
       if (wc == WEOF)
         return;
-      setbit (b, c);
-      if (case_fold && iswalpha (wc))
-        setbit_wc (iswupper (wc) ? towlower (wc) : towupper (wc), c);
+      if (case_fold)
+        {
+          setbit_wc (towlower (wc), c);
+          setbit_wc (towupper (wc), c);
+        }
     }
   else
     {
-      setbit (b, c);
-      if (case_fold && isalpha (b))
-        setbit_c (isupper (b) ? tolower (b) : toupper (b), c);
+      if (case_fold)
+        {
+          setbit (tolower (b), c);
+          setbit (toupper (b), c);
+        }
     }
+  setbit (b, c);
 }
 
 
@@ -1104,52 +1094,51 @@ parse_bracket_exp (void)
               c2 = ']';
             }
 
-          if (c2 == ']')
+          if (c2 != ']')
             {
-              /* In the case [x-], the - is an ordinary hyphen,
-                 which is left in c1, the lookahead character.  */
-              lexptr -= cur_mb_len;
-              lexleft += cur_mb_len;
-            }
-        }
-
-      if (c1 == '-' && c2 != ']')
-        {
-          if (c2 == '\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
-            FETCH_WC (c2, wc2, _("unbalanced ["));
+              if (c2 == '\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
+                FETCH_WC (c2, wc2, _("unbalanced ["));
 
-          if (MB_CUR_MAX > 1)
-            {
-              /* When case folding map a range, say [m-z] (or even [M-z])
-                 to the pair of ranges, [m-z] [M-Z].  */
-              REALLOC_IF_NECESSARY (work_mbc->range_sts,
-                                    range_sts_al, work_mbc->nranges + 1);
-              REALLOC_IF_NECESSARY (work_mbc->range_ends,
-                                    range_ends_al, work_mbc->nranges + 1);
-              work_mbc->range_sts[work_mbc->nranges] =
-                case_fold ? towlower (wc) : (wchar_t) wc;
-              work_mbc->range_ends[work_mbc->nranges++] =
-                case_fold ? towlower (wc2) : (wchar_t) wc2;
-
-              if (case_fold && (iswalpha (wc) || iswalpha (wc2)))
+              if (MB_CUR_MAX > 1)
                 {
+                  /* When case folding map a range, say [m-z] (or even [M-z])
+                     to the pair of ranges, [m-z] [M-Z].  Although this code
+                     is wrong in multiple ways, it's never used in practice.
+                     FIXME: Remove this (and related) unused code.  */
                   REALLOC_IF_NECESSARY (work_mbc->range_sts,
                                         range_sts_al, work_mbc->nranges + 1);
-                  work_mbc->range_sts[work_mbc->nranges] = towupper (wc);
                   REALLOC_IF_NECESSARY (work_mbc->range_ends,
                                         range_ends_al, work_mbc->nranges + 1);
-                  work_mbc->range_ends[work_mbc->nranges++] = towupper (wc2);
+                  work_mbc->range_sts[work_mbc->nranges] =
+                    case_fold ? towlower (wc) : (wchar_t) wc;
+                  work_mbc->range_ends[work_mbc->nranges++] =
+                    case_fold ? towlower (wc2) : (wchar_t) wc2;
+
+                  if (case_fold && (iswalpha (wc) || iswalpha (wc2)))
+                    {
+                      REALLOC_IF_NECESSARY (work_mbc->range_sts,
+                                            range_sts_al, work_mbc->nranges + 1);
+                      work_mbc->range_sts[work_mbc->nranges] = towupper (wc);
+                      REALLOC_IF_NECESSARY (work_mbc->range_ends,
+                                            range_ends_al, work_mbc->nranges + 1);
+                      work_mbc->range_ends[work_mbc->nranges++] = towupper (wc2);
+                    }
                 }
+              else if (using_simple_locale ())
+                for (; c <= c2; c++)
+                  setbit_case_fold_c (c, ccl);
+              else
+                known_bracket_exp = false;
+
+              colon_warning_state |= 8;
+              FETCH_WC (c1, wc1, _("unbalanced ["));
+              continue;
             }
-          else if (using_simple_locale ())
-            for (; c <= c2; c++)
-              setbit_case_fold_c (c, ccl);
-          else
-            known_bracket_exp = false;
 
-          colon_warning_state |= 8;
-          FETCH_WC (c1, wc1, _("unbalanced ["));
-          continue;
+          /* In the case [x-], the - is an ordinary hyphen,
+             which is left in c1, the lookahead character.  */
+          lexptr -= cur_mb_len;
+          lexleft += cur_mb_len;
         }
 
       colon_warning_state |= (c == ':') ? 2 : 4;
@@ -1160,16 +1149,22 @@ parse_bracket_exp (void)
           continue;
         }
 
-      if (case_fold && iswalpha (wc))
+      if (case_fold)
         {
-          wc = towlower (wc);
-          if (!setbit_wc (wc, ccl))
+          wint_t folded = towlower (wc);
+          if (folded != wc && !setbit_wc (folded, ccl))
+            {
+              REALLOC_IF_NECESSARY (work_mbc->chars, chars_al,
+                                    work_mbc->nchars + 1);
+              work_mbc->chars[work_mbc->nchars++] = folded;
+            }
+          folded = towupper (wc);
+          if (folded != wc && !setbit_wc (folded, ccl))
             {
               REALLOC_IF_NECESSARY (work_mbc->chars, chars_al,
                                     work_mbc->nchars + 1);
-              work_mbc->chars[work_mbc->nchars++] = wc;
+              work_mbc->chars[work_mbc->nchars++] = folded;
             }
-          wc = towupper (wc);
         }
       if (!setbit_wc (wc, ccl))
         {
@@ -1515,7 +1510,7 @@ lex (void)
           if (MB_CUR_MAX > 1)
             return lasttok = WCHAR;
 
-          if (case_fold && isalpha (c))
+          if (case_fold && (tolower (c) != c || toupper (c) != c))
             {
               zeroset (ccl);
               setbit_case_fold_c (c, ccl);
@@ -1759,17 +1754,23 @@ add_utf8_anychar (void)
 static void
 atom (void)
 {
-  if (0)
+  if (MBS_SUPPORT && tok == WCHAR)
     {
-      /* empty */
-    }
-  else if (MBS_SUPPORT && tok == WCHAR)
-    {
-      addtok_wc (case_fold ? towlower (wctok) : wctok);
-      if (case_fold && iswalpha (wctok))
+      addtok_wc (wctok);
+      if (case_fold)
         {
-          addtok_wc (towupper (wctok));
-          addtok (OR);
+          wint_t folded = towlower (wctok);
+          if (folded != wctok)
+            {
+              addtok_wc (folded);
+              addtok (OR);
+            }
+          folded = towupper (wctok);
+          if (folded != wctok)
+            {
+              addtok_wc (folded);
+              addtok (OR);
+            }
         }
 
       tok = lex ();
diff --git a/tests/Makefile.am b/tests/Makefile.am
index 972ffc5..219e96a 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -47,6 +47,7 @@ TESTS =                                               \
   case-fold-char-class                         \
   case-fold-char-range                         \
   case-fold-char-type                          \
+  case-fold-titlecase                          \
   char-class-multibyte                         \
   char-class-multibyte2                                \
   dfa-coverage                                 \
diff --git a/tests/case-fold-titlecase b/tests/case-fold-titlecase
new file mode 100755
index 0000000..0ece5c8
--- /dev/null
+++ b/tests/case-fold-titlecase
@@ -0,0 +1,41 @@
+#!/bin/sh
+# Check that case folding works even with titlecase characters.
+
+# Copyright 2014 Free Software Foundation, Inc.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+. "${srcdir=.}/init.sh"; path_prepend_ ../src
+
+require_en_utf8_locale_
+require_compiled_in_MB_support
+LC_ALL=en_US.UTF-8
+export LC_ALL
+
+fail=0
+
+LJ='\307\207' # U+01C7 LATIN CAPITAL LETTER LJ
+Lj='\307\210' # U+01C8 LATIN CAPITAL LETTER L WITH SMALL LETTER J
+lj='\307\211' # U+01C9 LATIN SMALL LETTER LJ
+pattern=$(printf "$Lj\n") || framework_failure_
+printf "$lj$lj\n$Lj$Lj\n$LJ$LJ\n" >in || framework_failure_
+
+grep -i "$pattern" in >out || fail=1
+compare in out || fail=1
+
+pattern="($pattern)\\1"
+grep -Ei "$pattern" in >out || fail=1
+compare in out || fail=1
+
+Exit $fail
-- 
1.8.5.3

bug#16911: [PATCH] grep: fix bugs with -i and titlecase

Reply via email to