Jim's fix for the fgrep infinite loop would erroneously miss matches
in SJIS character sets.  In this character set low bytes (i.e. ASCII
bytes) are also valid second bytes in a double-byte character, so you
have to continue looking for a match, even if you match in the middle
of a double-byte character.

The attached test will be skipped unless (on a glibc system) you run
something like

  mkdir /usr/lib/locale/ja_JP.SHIFT_JIS
  zcat /usr/share/i18n/charmaps/SHIFT_JIS.gz | \
    localedef \
      -f - \
      -i /usr/share/i18n/locales/ja_JP \
      /usr/lib/locale/ja_JP.SHIFT_JIS

I verified that it fails before applying the first patch, and passes
with it.

Paolo Bonzini (2):
  grep -F: fix a bug with SJIS character sets
  tests: add tests for SJIS character sets

 src/kwsearch.c    |   16 +++++++++++-----
 tests/Makefile.am |    1 +
 tests/sjis-mb     |   50 ++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 62 insertions(+), 5 deletions(-)
 create mode 100644 tests/sjis-mb

>From 86c6e3f59a1dceeba41463bf60889bf66c90e8f4 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <[email protected]>
Date: Mon, 29 Mar 2010 08:54:30 +0200
Subject: [PATCH 1/2] grep -F: fix a bug with SJIS character sets

Commit db9d6 would erroneously skip matches in SJIS character sets.  In
this character set low bytes (i.e. ASCII bytes) are also valid second
bytes in a double-byte character, so you have to continue looking for
a match, even if you match in the middle of a double-byte character.

* src/kwsearch.c: Ensure that beg is advanced by at least one byte,
but do not fail immediately after matching in the middle of a double-byte
character.
---
 src/kwsearch.c |   16 +++++++++++-----
 1 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/src/kwsearch.c b/src/kwsearch.c
index a20c3a8..3f65c23 100644
--- a/src/kwsearch.c
+++ b/src/kwsearch.c
@@ -105,14 +105,20 @@ Fexecute (char const *buf, size_t size, size_t 
*match_size,
        goto failure;
       len = kwsmatch.size[0];
 #ifdef MBS_SUPPORT
-      char const *s0 = mb_start;
       if (MB_CUR_MAX > 1 && is_mb_middle (&mb_start, beg + offset, buf + size,
                                          len))
         {
-         if (mb_start == s0)
-           goto failure;
-          beg = mb_start - 1;
-          continue; /* It is a part of multibyte character.  */
+          /* The match was a part of multibyte character, advance at least
+             one byte to ensure no infinite loop happens.  */
+          mbstate_t s;
+          memset (&s, 0, sizeof s);
+          size_t mb_len = mbrlen (mb_start, (buf + size) - (beg + offset), &s);
+         if (mb_len == (size_t) -2)
+            goto failure;
+          beg = mb_start;
+         if (mb_len != (size_t) -1)
+            beg += mb_len - 1;
+          continue;
         }
 #endif /* MBS_SUPPORT */
       beg += offset;
-- 
1.6.6.1


>From 3147245944353107b9f61226d7e42b26443655fc Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <[email protected]>
Date: Mon, 29 Mar 2010 08:49:42 +0200
Subject: [PATCH 2/2] tests: add tests for SJIS character sets

The attached test will be skipped unless (on a glibc system) you run
something like

  mkdir /usr/lib/locale/ja_JP.SHIFT_JIS
  zcat /usr/share/i18n/charmaps/SHIFT_JIS.gz | \
    localedef \
      -f - \
      -i /usr/share/i18n/locales/ja_JP \
      /usr/lib/locale/ja_JP.SHIFT_JIS

* tests/Makefile.am: Add sjis-mb.
* tests/sjis-mb: New.
---
 tests/Makefile.am |    1 +
 tests/sjis-mb     |   50 ++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 51 insertions(+), 0 deletions(-)
 create mode 100644 tests/sjis-mb

diff --git a/tests/Makefile.am b/tests/Makefile.am
index 3caeb78..6920d21 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -41,6 +41,7 @@ TESTS =                                               \
   pcre.sh                                      \
   pcre-z                                       \
   reversed-range-endpoints                     \
+  sjis-mb                                      \
   spencer1.sh                                  \
   spencer1-locale                              \
   status.sh                                    \
diff --git a/tests/sjis-mb b/tests/sjis-mb
new file mode 100644
index 0000000..f2479ba
--- /dev/null
+++ b/tests/sjis-mb
@@ -0,0 +1,50 @@
+#!/bin/sh
+# similar to euc-mb and fgrep-infloop, but tests SJIS encoding.
+# in this encoding, an ASCII character is both a valid single-byte
+# character, and a suffix of a valid double-byte character
+
+: ${srcdir=.}
+. "$srcdir/init.sh"; path_prepend_ ../src
+
+require_timeout_
+
+# % becomes an half-width katakana in SJIS, and an invalid sequence
+# in UTF-8.  Use this to try skipping implementations that do not
+# support SJIS and treat it as UTF-8.
+encode() { echo "$1" | tr @% '\203\301'; }
+
+seq=0
+test_grep_reject() {
+  seq=`expr $seq + 1`
+  encode "$2" | \
+    LC_ALL=ja_JP.SHIFT_JIS \
+      timeout 10s grep $1 `encode "$3"` > out$seq 2>&1
+  test $? = 1
+}
+
+test_grep() {
+  seq=`expr $seq + 1`
+  encode "$2" > exp$seq
+  LC_ALL=ja_JP.SHIFT_JIS \
+    timeout 10s grep $1 `encode "$3"` exp$seq > out$seq 2>&1
+  test $? = 0 && compare out$seq exp$seq
+}
+
+test_grep_reject -F @@ @ || skip_ 'system does not seem to know about SJIS'
+test_grep -F %%AA A || skip_ 'system seems to treat SJIS the same as UTF-8'
+
+failure_tes...@a
+successful_tests='%%AA @AA @A@@A'
+
+fail=0
+for i in $successful_tests; do
+  test_grep -F $i A || fail=1
+  test_grep -E $i A || fail=1
+done
+
+for i in $failure_tests; do
+  test_grep_reject -F $i A || fail=1
+  test_grep_reject -E $i A || fail=1
+done
+
+Exit $fail
-- 
1.6.6.1



Reply via email to