KWset and DFA currently behave differently when the pattern contains an invalid multibyte sequence.
encode() { echo "$1" | tr ABC '\357\274\241'; }
encode ABC | env LC_ALL=en_US.utf8 src/grep "$(encode A)\|q"
encode ABC | env LC_ALL=en_US.utf8 src/grep -F "$(encode A)"
encode aABC | env LC_ALL=en_US.utf8 src/grep "a$(encode A)\|q"
encode aABC | env LC_ALL=en_US.utf8 src/grep -F "a$(encode A)"
We expect all four commands to give the same result, but only the 4th returns 1 row.
This patch fixes that, so that all four commands return 1 row.
Norihiro
From 0b5084286c23e75139cc09e02c1ad8495059eb38 Mon Sep 17 00:00:00 2001
From: Norihiro Tanaka <[email protected]>
Date: Wed, 30 Apr 2014 11:22:27 +0900
Subject: [PATCH] grep: fix the different behaviour for an invalid sequence
between KWset and DFA
* src/dfa.c (ctok): Define new global variable.
(dfambcache): Don't cache invalid sequences, because they cannot be
expressed as any wide character.
(mbs_to_wchar): Return WEOF for invalid sequences.
(parse_bracket_exp): Fix it.
(lex): Set `ctok'.
(atom, match_anychar, match_mb_charset): Fix it.
* src/searchutils.c (is_mb_middle): Fix it.
* tests/prefix-of-multibyte: Fix it.
---
src/dfa.c | 101 ++++++++++++++++++++++++++++------------------
src/kwsearch.c | 7 +---
src/searchutils.c | 2 +-
tests/prefix-of-multibyte | 12 ++++--
4 files changed, 72 insertions(+), 50 deletions(-)
diff --git a/src/dfa.c b/src/dfa.c
index 362de2c..c83a940 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -456,9 +456,13 @@ dfambcache (struct dfa *d)
wint_t wi;
switch (mbrtowc (&wc, &c, 1, &s))
{
- default: wi = wc; break;
- case (size_t) -2: wi = WEOF; break;
- case (size_t) -1: wi = uc; break;
+ default:
+ wi = wc;
+ break;
+ case (size_t) -1:
+ case (size_t) -2:
+ wi = WEOF;
+ break;
}
d->mbrtowc_cache[uc] = wi;
}
@@ -492,7 +496,6 @@ mbs_to_wchar (wchar_t *pwc, char const *s, size_t n, struct
dfa *d)
if (0 < nbytes && nbytes < (size_t) -2)
return nbytes;
memset (&d->mbs, 0, sizeof d->mbs);
- wc = uc;
}
*pwc = wc;
@@ -847,6 +850,8 @@ static int cur_mb_len = 1; /* Length of the multibyte
representation of
/* These variables are used only if (MB_CUR_MAX > 1). */
static wchar_t wctok; /* Wide character representation of the current
multibyte character. */
+static unsigned int ctok; /* Single character representation of the
current
+ multibyte character. */
/* Note that characters become unsigned here. */
@@ -1128,19 +1133,22 @@ parse_bracket_exp (void)
to the pair of ranges, [m-z] [M-Z]. Although this code
is wrong in multiple ways, it's never used in practice.
FIXME: Remove this (and related) unused code. */
- work_mbc->ranges
- = maybe_realloc (work_mbc->ranges, work_mbc->nranges + 2,
- &ranges_al, sizeof *work_mbc->ranges);
- work_mbc->ranges[work_mbc->nranges].beg
- = case_fold ? towlower (wc) : wc;
- work_mbc->ranges[work_mbc->nranges++].end
- = case_fold ? towlower (wc2) : wc2;
-
- if (case_fold && (iswalpha (wc) || iswalpha (wc2)))
+ if (wc != WEOF && wc2 != WEOF)
{
- work_mbc->ranges[work_mbc->nranges].beg = towupper (wc);
+ work_mbc->ranges
+ = maybe_realloc (work_mbc->ranges, work_mbc->nranges +
2,
+ &ranges_al, sizeof *work_mbc->ranges);
+ work_mbc->ranges[work_mbc->nranges].beg
+ = case_fold ? towlower (wc) : wc;
work_mbc->ranges[work_mbc->nranges++].end
- = towupper (wc2);
+ = case_fold ? towlower (wc2) : wc2;
+
+ if (case_fold && (iswalpha (wc) || iswalpha (wc2)))
+ {
+ work_mbc->ranges[work_mbc->nranges].beg = towupper
(wc);
+ work_mbc->ranges[work_mbc->nranges++].end
+ = towupper (wc2);
+ }
}
}
else if (using_simple_locale ())
@@ -1184,23 +1192,28 @@ parse_bracket_exp (void)
continue;
}
- if (case_fold)
- {
- wchar_t folded[CASE_FOLDED_BUFSIZE];
- int i, n = case_folded_counterparts (wc, folded);
- work_mbc->chars = maybe_realloc (work_mbc->chars,
- work_mbc->nchars + n, &chars_al,
- sizeof *work_mbc->chars);
- for (i = 0; i < n; i++)
- if (!setbit_wc (folded[i], ccl))
- work_mbc->chars[work_mbc->nchars++] = folded[i];
- }
- if (!setbit_wc (wc, ccl))
+ if (wc != WEOF)
{
- work_mbc->chars = maybe_realloc (work_mbc->chars, work_mbc->nchars,
- &chars_al, sizeof *work_mbc->chars);
- work_mbc->chars[work_mbc->nchars++] = wc;
+ if (case_fold)
+ {
+ wchar_t folded[CASE_FOLDED_BUFSIZE];
+ int i, n = case_folded_counterparts (wc, folded);
+ work_mbc->chars = maybe_realloc (work_mbc->chars,
+ work_mbc->nchars + n, &chars_al,
+ sizeof *work_mbc->chars);
+ for (i = 0; i < n; i++)
+ if (!setbit_wc (folded[i], ccl))
+ work_mbc->chars[work_mbc->nchars++] = folded[i];
+ }
+ else if (!setbit_wc (wc, ccl))
+ {
+ work_mbc->chars = maybe_realloc (work_mbc->chars,
work_mbc->nchars,
+ &chars_al, sizeof
*work_mbc->chars);
+ work_mbc->chars[work_mbc->nchars++] = wc;
+ }
}
+ else
+ setbit (c, ccl);
}
while ((wc = wc1, (c = c1) != ']'));
@@ -1245,7 +1258,8 @@ lex (void)
"if (backslash) ...". */
for (i = 0; i < 2; ++i)
{
- FETCH_WC (c, wctok, NULL);
+ FETCH_WC (ctok, wctok, NULL);
+ c = ctok;
if (c == (unsigned int) EOF)
goto normal_char;
@@ -1776,18 +1790,23 @@ atom (void)
{
if (tok == WCHAR)
{
- addtok_wc (wctok);
-
- if (case_fold)
+ if (wctok != WEOF)
{
- wchar_t folded[CASE_FOLDED_BUFSIZE];
- int i, n = case_folded_counterparts (wctok, folded);
- for (i = 0; i < n; i++)
+ addtok_wc (wctok);
+
+ if (case_fold)
{
- addtok_wc (folded[i]);
- addtok (OR);
+ wchar_t folded[CASE_FOLDED_BUFSIZE];
+ int i, n = case_folded_counterparts (wctok, folded);
+ for (i = 0; i < n; i++)
+ {
+ addtok_wc (folded[i]);
+ addtok (OR);
+ }
}
}
+ else
+ addtok_mb (ctok, 3);
tok = lex ();
}
@@ -2949,6 +2968,8 @@ match_anychar (struct dfa *d, state_num s, position pos,
if (syntax_bits & RE_DOT_NOT_NULL)
return 0;
}
+ else if (wc == WEOF)
+ return 0;
context = wchar_context (wc);
if (!SUCCEEDS_IN_CONTEXT (pos.constraint, d->states[s].context, context))
@@ -2985,6 +3006,8 @@ match_mb_charset (struct dfa *d, state_num s, position
pos,
if (syntax_bits & RE_DOT_NOT_NULL)
return 0;
}
+ else if (wc == WEOF)
+ return 0;
context = wchar_context (wc);
if (!SUCCEEDS_IN_CONTEXT (pos.constraint, d->states[s].context, context))
diff --git a/src/kwsearch.c b/src/kwsearch.c
index 7c64c86..46569e9 100644
--- a/src/kwsearch.c
+++ b/src/kwsearch.c
@@ -131,12 +131,7 @@ Fexecute (char const *buf, size_t size, size_t *match_size,
{
/* The match was a part of multibyte character, advance at least
one byte to ensure no infinite loop happens. */
- mbstate_t s;
- memset (&s, 0, sizeof s);
- size_t mb_len = mbrlen (mb_start, (buf + size) - (beg + offset), &s);
- if (mb_len == (size_t) -2 || mb_len == (size_t) -1)
- goto failure;
- beg = mb_start + mb_len - 1;
+ beg = mb_start;
continue;
}
beg += offset;
diff --git a/src/searchutils.c b/src/searchutils.c
index 6440f07..ea26a70 100644
--- a/src/searchutils.c
+++ b/src/searchutils.c
@@ -270,5 +270,5 @@ is_mb_middle (const char **good, const char *buf, const
char *end,
return true;
/* P == BUF here. */
- return 0 < match_len && match_len < mbrlen (p, end - p, &cur_state);
+ return false;
}
diff --git a/tests/prefix-of-multibyte b/tests/prefix-of-multibyte
index b15fa9b..70a924e 100755
--- a/tests/prefix-of-multibyte
+++ b/tests/prefix-of-multibyte
@@ -1,5 +1,5 @@
#!/bin/sh
-# This would mistakenly print a line prior to grep-2.6.2.
+# This would mistakenly print a line prior to grep-2.18.
. "${srcdir=.}/init.sh"; path_prepend_ ../src
require_en_utf8_locale_
@@ -7,14 +7,18 @@ require_compiled_in_MB_support
encode() { echo "$1" | tr ABC '\357\274\241'; }
+encode ABC >exp1
+encode aABC >exp2
+
fail=0
for LOC in en_US.UTF-8 $LOCALE_FR_UTF8; do
for opt in '' '-F'; do
out=out-$opt-$LOC
- encode ABC | LC_ALL=$LOC grep $opt "$(encode A)" > $out 2>&1
- test $? = 1 || fail=1
- compare /dev/null $out || fail=1
+ LC_ALL=$LOC grep $opt "$(encode A)" exp1 >$out || fail=1
+ compare exp1 $out || fail=1
+ LC_ALL=$LOC grep $opt "$(encode aA)" exp2 >$out || fail=1
+ compare exp2 $out || fail=1
done
done
--
1.9.2