bug#15192: UTF-16 surrogate pair handling in grep -i option

Jim Meyering Sat, 31 Aug 2013 13:26:37 -0700

Hi Corinna,
Sorry about the delay.
I'm prepared to push the following.  Are you ok with that?


On Tue, Aug 27, 2013 at 2:35 AM, Corinna Vinschen <[email protected]> wrote:
> On Aug 26 21:58, Jim Meyering wrote:
>> I guess it is a different point of view.  Maybe I'm just too
>> forward-thinking? :-)
>> I.e., if the remaining cygwin-specific bug is fixed soon, there will
>> be little reason for separate tests.
>> Are you planning to work on the cygwin/regexp bug?
>
> Not in the next couple of weeks.  I'll be abroad for a while.  Maybe in
> October or November.  I put it on my TODO list.  The biggest problem is
> that I know the BSD regex code pretty well, but the gnulib regex code is
> very different.  I already had a look two weeks ago, but I have not
> found the right place to mount surrogate pairs into it :(
>
>
> Corinna
>
> --
> Corinna Vinschen
> Cygwin Maintainer
> Red Hat

From f2e995fe6bf1889f4d8e0ae40054ee2bd1ddab3d Mon Sep 17 00:00:00 2001
From: Corinna Vinschen <[email protected]>
Date: Fri, 16 Aug 2013 20:50:12 +0200
Subject: [PATCH] fix Cygwin UTF-16 surrogate-pair handling with -i

grep -i would segfault on systems using UTF-16-based wchar_t (Cygwin)
when converting an input string containing certain 4-byte UTF-8
sequences to lower case.  The conversions to wchar_t and back to
a UTF-8 multibyte string did not take surrogate pairs into account.
* src/searchutils.c (mbtolower) [__CYGWIN__]: Detect and handle
surrogate pairs when converting.
* NEWS (Bug fixes): Mention it.
* tests/surrogate-pair: New test.
* tests/Makefile.am (TESTS): Add it.
Reported by: Jim Burwell
---
 NEWS                 |  6 ++++++
 src/searchutils.c    | 60 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 tests/Makefile.am    |  1 +
 tests/surrogate-pair | 44 ++++++++++++++++++++++++++++++++++++++
 4 files changed, 110 insertions(+), 1 deletion(-)
 create mode 100755 tests/surrogate-pair

diff --git a/NEWS b/NEWS
index 8796a1b..fba3094 100644
--- a/NEWS
+++ b/NEWS
@@ -4,6 +4,12 @@ GNU grep NEWS                                    -*- outline 
-*-

 ** Bug fixes

+  grep -i would segfault on systems using UTF-16-based wchar_t (Cygwin)
+  when converting an input string containing certain 4-byte UTF-8
+  sequences to lower case.  The conversions to wchar_t and back to
+  a UTF-8 multibyte string did not take surrogate pairs into account.
+  [bug present since at least grep-2.6, though the segfault is new with 2.13]
+
   grep -E would segfault when given a regexp like '([^.]*[M]){1,2}'
   for any multibyte character M. [bug introduced in grep-2.6, which would
   segfault, but 2.7 and 2.8 had no problem, and 2.9 through 2.14 would
diff --git a/src/searchutils.c b/src/searchutils.c
index 160f80d..f27a01b 100644
--- a/src/searchutils.c
+++ b/src/searchutils.c
@@ -111,6 +111,37 @@ mbtolower (const char *beg, size_t *n, mb_len_map_t 
**len_map_p)
     {
       wchar_t wc;
       size_t mbclen = mbrtowc (&wc, beg, end - beg, &is);
+#ifdef __CYGWIN__
+      /* Handle a UTF-8 sequence for a character beyond the base plane.
+         Cygwin's wchar_t is UTF-16, as in the underlying OS.  This
+         results in surrogate pairs which need some extra attention.  */
+      wint_t wci = 0;
+      if (mbclen == 3 && (wc & 0xdc00) == 0xd800)
+        {
+          /* We got the start of a 4 byte UTF-8 sequence.  This is returned
+             as a UTF-16 surrogate pair.  The first call to mbrtowc returned 3
+             and wc has been set to a high surrogate value, now we're going
+             to fetch the matching low surrogate.  This second call to mbrtowc
+             is supposed to return 1 to complete the 4 byte UTF-8 sequence.  */
+          wchar_t wc_2;
+          size_t mbclen_2 = mbrtowc (&wc_2, beg + mbclen, end - beg - mbclen,
+                                     &is);
+          if (mbclen_2 == 1 && (wc_2 & 0xdc00) == 0xdc00)
+            {
+              /* Match.  Convert this to a 4 byte wint_t which constitutes
+                 a 32-bit UTF-32 value.  */
+              wci = ( (((wint_t) (wc - 0xd800)) << 10)
+                     | ((wint_t) (wc_2 - 0xdc00)))
+                    + 0x10000;
+              ++mbclen;
+            }
+          else
+            {
+              /* Invalid UTF-8 sequence.  */
+              mbclen = (size_t) -1;
+            }
+        }
+#endif
       if (outlen + mb_cur_max >= outalloc)
         {
           size_t dm = m - len_map;
@@ -132,8 +163,35 @@ mbtolower (const char *beg, size_t *n, mb_len_map_t 
**len_map_p)
         }
       else
         {
+          size_t ombclen;
           beg += mbclen;
-          size_t ombclen = wcrtomb (p, towlower ((wint_t) wc), &os);
+#ifdef __CYGWIN__
+          /* Handle Unicode characters beyond the base plane.  */
+          if (mbclen == 4)
+            {
+              /* towlower, taking wint_t (4 bytes), handles UCS-4 values.  */
+              wci = towlower (wci);
+              if (wci >= 0x10000)
+                {
+                  wci -= 0x10000;
+                  wc = (wci >> 10) | 0xd800;
+                  /* No need to check the return value.  When reading the
+                     high surrogate, the return value will be 0 and only the
+                     mbstate indicates that we're in the middle of reading a
+                     surrogate pair.  The next wcrtomb call reading the low
+                     surrogate will then return 4 and reset the mbstate.  */
+                  wcrtomb (p, wc, &os);
+                  wc = (wci & 0x3ff) | 0xdc00;
+                }
+              else
+                {
+                  wc = (wchar_t) wci;
+                }
+              ombclen = wcrtomb (p, wc, &os);
+            }
+          else
+#endif
+          ombclen = wcrtomb (p, towlower ((wint_t) wc), &os);
           *m = mbclen - ombclen;
           memset (m + 1, 0, ombclen - 1);
           m += ombclen;
diff --git a/tests/Makefile.am b/tests/Makefile.am
index c96ce06..581f688 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -87,6 +87,7 @@ TESTS =                                               \
   spencer1                                     \
   spencer1-locale                              \
   status                                       \
+  surrogate-pair                               \
   symlink                                      \
   turkish-I                                    \
   turkish-I-without-dot                                \
diff --git a/tests/surrogate-pair b/tests/surrogate-pair
new file mode 100755
index 0000000..a589819
--- /dev/null
+++ b/tests/surrogate-pair
@@ -0,0 +1,44 @@
+#!/bin/sh
+# Trigger a segfault-inducing bug with -i in grep-2.14 on Cygwin.
+
+# Copyright (C) 2013 Free Software Foundation, Inc.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+. "${srcdir=.}/init.sh"; path_prepend_ ../src
+
+require_en_utf8_locale_
+require_compiled_in_MB_support
+
+fail=0
+
+printf '\xf0\x90\x90\x85\n' > in || framework_failure_
+
+LC_ALL=en_US.UTF-8
+export LC_ALL
+
+# On Cygwin, before grep-2.15, this would segfault.
+grep -i anything-else in > out 2>&1
+# Require not just non-zero exit status, but exactly 1.
+test $? = 1 || fail=1
+# Expect no output.
+compare /dev/null out || fail=1
+
+# Also test whether a surrogate-pair in the search string works.
+for opt in '' -i -E -F -iE -iF; do
+  grep --file=in $opt in > out 2>&1 || fail=1
+  compare out in || fail=1
+done
+
+Exit $fail
-- 
1.8.4.21.g992c386

bug#15192: UTF-16 surrogate pair handling in grep -i option

Reply via email to