Paolo Bonzini wrote: > Jim's fix for the fgrep infinite loop would erroneously miss matches > in SJIS character sets. In this character set low bytes (i.e. ASCII > bytes) are also valid second bytes in a double-byte character, so you > have to continue looking for a match, even if you match in the middle > of a double-byte character.
Good catch! Thank you. > The attached test will be skipped unless (on a glibc system) you run > something like > > mkdir /usr/lib/locale/ja_JP.SHIFT_JIS > zcat /usr/share/i18n/charmaps/SHIFT_JIS.gz | \ > localedef \ > -f - \ > -i /usr/share/i18n/locales/ja_JP \ > /usr/lib/locale/ja_JP.SHIFT_JIS It is telling that when you run those commands, you see this diagnostic: character map `SHIFT_JIS' is not ASCII compatible, locale not ISO C compliant > I verified that it fails before applying the first patch, and passes > with it. > > Paolo Bonzini (2): > grep -F: fix a bug with SJIS character sets > tests: add tests for SJIS character sets > > src/kwsearch.c | 16 +++++++++++----- > tests/Makefile.am | 1 + > tests/sjis-mb | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ > 3 files changed, 62 insertions(+), 5 deletions(-) > create mode 100644 tests/sjis-mb > >>From 86c6e3f59a1dceeba41463bf60889bf66c90e8f4 Mon Sep 17 00:00:00 2001 > From: Paolo Bonzini <[email protected]> > Date: Mon, 29 Mar 2010 08:54:30 +0200 > Subject: [PATCH 1/2] grep -F: fix a bug with SJIS character sets > > Commit db9d6 would erroneously skip matches in SJIS character sets. In > this character set low bytes (i.e. ASCII bytes) are also valid second > bytes in a double-byte character, so you have to continue looking for > a match, even if you match in the middle of a double-byte character. > > * src/kwsearch.c: Ensure that beg is advanced by at least one byte, > but do not fail immediately after matching in the middle of a double-byte > character. > --- > src/kwsearch.c | 16 +++++++++++----- > 1 files changed, 11 insertions(+), 5 deletions(-) > > diff --git a/src/kwsearch.c b/src/kwsearch.c > index a20c3a8..3f65c23 100644 > --- a/src/kwsearch.c > +++ b/src/kwsearch.c > @@ -105,14 +105,20 @@ Fexecute (char const *buf, size_t size, size_t > *match_size, > goto failure; > len = kwsmatch.size[0]; > #ifdef MBS_SUPPORT > - char const *s0 = mb_start; > if (MB_CUR_MAX > 1 && is_mb_middle (&mb_start, beg + offset, buf + > size, > len)) > { > - if (mb_start == s0) > - goto failure; > - beg = mb_start - 1; > - continue; /* It is a part of multibyte character. */ > + /* The match was a part of multibyte character, advance at least > + one byte to ensure no infinite loop happens. */ > + mbstate_t s; > + memset (&s, 0, sizeof s); > + size_t mb_len = mbrlen (mb_start, (buf + size) - (beg + offset), > &s); > + if (mb_len == (size_t) -2) > + goto failure; > + beg = mb_start; > + if (mb_len != (size_t) -1) > + beg += mb_len - 1; > + continue; Your fix does indeed work as advertised. However, the mixed-sp+TAB indentation above is ugly (yes, we will fix things asap, once the pace of bug fixes decreases) > } > #endif /* MBS_SUPPORT */ > beg += offset; > -- > 1.6.6.1 > > >>From 3147245944353107b9f61226d7e42b26443655fc Mon Sep 17 00:00:00 2001 > From: Paolo Bonzini <[email protected]> > Date: Mon, 29 Mar 2010 08:49:42 +0200 > Subject: [PATCH 2/2] tests: add tests for SJIS character sets > > The attached test will be skipped unless (on a glibc system) you run > something like > > mkdir /usr/lib/locale/ja_JP.SHIFT_JIS > zcat /usr/share/i18n/charmaps/SHIFT_JIS.gz | \ > localedef \ > -f - \ > -i /usr/share/i18n/locales/ja_JP \ > /usr/lib/locale/ja_JP.SHIFT_JIS > > * tests/Makefile.am: Add sjis-mb. > * tests/sjis-mb: New. > --- > tests/Makefile.am | 1 + > tests/sjis-mb | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ > 2 files changed, 51 insertions(+), 0 deletions(-) > create mode 100644 tests/sjis-mb > > diff --git a/tests/Makefile.am b/tests/Makefile.am > index 3caeb78..6920d21 100644 > --- a/tests/Makefile.am > +++ b/tests/Makefile.am > @@ -41,6 +41,7 @@ TESTS = \ > pcre.sh \ > pcre-z \ > reversed-range-endpoints \ > + sjis-mb \ > spencer1.sh \ > spencer1-locale \ > status.sh \ > diff --git a/tests/sjis-mb b/tests/sjis-mb > new file mode 100644 > index 0000000..f2479ba > --- /dev/null > +++ b/tests/sjis-mb > @@ -0,0 +1,50 @@ > +#!/bin/sh > +# similar to euc-mb and fgrep-infloop, but tests SJIS encoding. > +# in this encoding, an ASCII character is both a valid single-byte > +# character, and a suffix of a valid double-byte character > + > +: ${srcdir=.} > +. "$srcdir/init.sh"; path_prepend_ ../src > + > +require_timeout_ > + > +# % becomes an half-width katakana in SJIS, and an invalid sequence s/an/a/ > +# in UTF-8. Use this to try skipping implementations that do not > +# support SJIS and treat it as UTF-8. > +encode() { echo "$1" | tr @% '\203\301'; } > + > +seq=0 I find s/seq/k/ to be slightly more readable. > +test_grep_reject() { > + seq=`expr $seq + 1` > + encode "$2" | \ > + LC_ALL=ja_JP.SHIFT_JIS \ > + timeout 10s grep $1 `encode "$3"` > out$seq 2>&1 Please use $(...), rather than `...` in tests. init.sh ensures that the shell we are using is capable enough. > + test $? = 1 > +} > + > +test_grep() { > + seq=`expr $seq + 1` > + encode "$2" > exp$seq > + LC_ALL=ja_JP.SHIFT_JIS \ > + timeout 10s grep $1 `encode "$3"` exp$seq > out$seq 2>&1 > + test $? = 0 && compare out$seq exp$seq > +} > + > +test_grep_reject -F @@ @ || skip_ 'system does not seem to know about SJIS' > +test_grep -F %%AA A || skip_ 'system seems to treat SJIS the same as UTF-8' Nice. > +failure_tes...@a > +successful_tests='%%AA @AA @A@@A' > + > +fail=0 > +for i in $successful_tests; do > + test_grep -F $i A || fail=1 > + test_grep -E $i A || fail=1 > +done > + > +for i in $failure_tests; do > + test_grep_reject -F $i A || fail=1 > + test_grep_reject -E $i A || fail=1 > +done > + > +Exit $fail
