/*
 * Patch summary (leading commentary; sits before the first file header).
 *
 * Purpose: when case folding is active (case_fold in dfa.c, the
 * case-ignoring key rewrite in main.c), stop assuming a character has
 * only a lower-case and an upper-case form.  Every site that used
 * towlower()/towupper() now loops over the wctrans() mappings named in
 * a new table, wctname[] = { "tolower", "toupper", "totitle", NULL },
 * skips any mapping for which wctrans() returns 0, and applies
 * towctrans() with the remaining ones.
 *
 * src/dfa.c
 *   - adds the file-local wctname[] table.
 *   - setbit_case_fold_c: for an alphabetic wc, sets the bit of each
 *     mapped character that differs from wc.
 *   - parse_bracket_exp, range branch: stores the range endpoints
 *     unmodified, then, if either endpoint is alphabetic, appends one
 *     additional range per available mapping, built from the mapped
 *     endpoints.
 *   - parse_bracket_exp, single-character branch: records wc itself
 *     first, then each mapped character that differs from wc.
 *   - atom: emits wctok itself, then for each mapped character that
 *     differs from wctok emits that character followed by an OR token.
 *
 * src/main.c
 *   - adds a second, identical file-local wctname[] table.
 *   - trivial_case_ignore: after copying the original character, copies
 *     the multibyte encoding of each mapped character that differs from
 *     wc, then writes the closing ']'.  A WCRTOMB result <= 0 still
 *     jumps to skip_case_ignore_optimization.
 *
 * tests
 *   - registers case-fold-title-case in TESTS and adds the script.  It
 *     writes the byte pairs C7 87, C7 88 and C7 89 (UTF-8 for U+01C7,
 *     U+01C8 and U+01C9) to exp1, exp2 and exp3, then runs grep -i for
 *     four pattern-file/input-file pairs (exp2 in exp1, exp3 in exp1,
 *     exp1 in exp2, exp3 in exp2) under en_US.UTF-8 and
 *     $LOCALE_FR_UTF8, expecting the input line to be matched each time.
 *
 * NOTE(review): the line breaks of this copy of the patch have been
 *   collapsed, so it must be regenerated from the repository before it
 *   can be applied.
 * NOTE(review): wctrans() is looked up again on every loop iteration at
 *   all five call sites; consider resolving the three descriptors once.
 * NOTE(review): atom() and trivial_case_ignore() only compare each
 *   mapped character against the original, so if two mappings return
 *   the same character it is emitted twice.  Presumably harmless, but
 *   confirm.
 * NOTE(review): the range branch of parse_bracket_exp appends a range
 *   for every available mapping without checking that either endpoint
 *   actually changed.
 * NOTE(review): trivial_case_ignore can now write up to three variants
 *   per input character; the allocation of the output buffer is outside
 *   this diff, so confirm that its size still covers the worst case.
 * NOTE(review): the test reuses the names out1-$LOC and out2-$LOC in its
 *   third and fourth loops, overwriting the earlier outputs, and never
 *   uses exp3 as the searched file.  No guard for a missing
 *   en_US.UTF-8 or $LOCALE_FR_UTF8 locale is visible here; confirm
 *   whether init.sh already provides one.
 */
diff --git a/src/dfa.c b/src/dfa.c index f7453c7..8e59ad2 100644 --- a/src/dfa.c +++ b/src/dfa.c @@ -90,6 +90,8 @@ /* Sets of unsigned characters are stored as bit vectors in arrays of ints. */ typedef unsigned int charclass[CHARCLASS_INTS]; +static const char * const wctname[] = { "tolower", "toupper", "totitle", NULL }; + /* Convert a possibly-signed character to an unsigned character. This is a bit safer than casting to unsigned char, since it catches some type errors that the cast doesn't. */ @@ -739,7 +741,18 @@ setbit_case_fold_c (int b, charclass c) return; setbit (b, c); if (case_fold && iswalpha (wc)) - setbit_wc (iswupper (wc) ? towlower (wc) : towupper (wc), c); + { + unsigned int i; + for (i = 0; wctname[i]; i++) + { + wctrans_t wct = wctrans (wctname[i]); + if (!wct) + continue; + wint_t wc2 = towctrans (wc, wct); + if (wc2 != wc) + setbit_wc (wc2, c); + } + } } else { @@ -1089,19 +1102,24 @@ parse_bracket_exp (void) range_sts_al, work_mbc->nranges + 1); REALLOC_IF_NECESSARY (work_mbc->range_ends, range_ends_al, work_mbc->nranges + 1); - work_mbc->range_sts[work_mbc->nranges] = - case_fold ? towlower (wc) : (wchar_t) wc; - work_mbc->range_ends[work_mbc->nranges++] = - case_fold ? 
towlower (wc2) : (wchar_t) wc2; + work_mbc->range_sts[work_mbc->nranges] = (wchar_t) wc; + work_mbc->range_ends[work_mbc->nranges++] = (wchar_t) wc2; if (case_fold && (iswalpha (wc) || iswalpha (wc2))) { - REALLOC_IF_NECESSARY (work_mbc->range_sts, - range_sts_al, work_mbc->nranges + 1); - work_mbc->range_sts[work_mbc->nranges] = towupper (wc); - REALLOC_IF_NECESSARY (work_mbc->range_ends, - range_ends_al, work_mbc->nranges + 1); - work_mbc->range_ends[work_mbc->nranges++] = towupper (wc2); + unsigned int i; + for (i = 0; wctname[i]; i++) + { + wctrans_t wct = wctrans (wctname[i]); + if (!wct) + continue; + REALLOC_IF_NECESSARY (work_mbc->range_sts, + range_sts_al, work_mbc->nranges + 1); + work_mbc->range_sts[work_mbc->nranges] = (wchar_t) towctrans (wc, wct); + REALLOC_IF_NECESSARY (work_mbc->range_ends, + range_ends_al, work_mbc->nranges + 1); + work_mbc->range_ends[work_mbc->nranges++] = (wchar_t) towctrans (wc2, wct); + } } } else @@ -1129,23 +1147,30 @@ parse_bracket_exp (void) continue; } - if (case_fold && iswalpha (wc)) - { - wc = towlower (wc); - if (!setbit_wc (wc, ccl)) - { - REALLOC_IF_NECESSARY (work_mbc->chars, chars_al, - work_mbc->nchars + 1); - work_mbc->chars[work_mbc->nchars++] = wc; - } - wc = towupper (wc); - } if (!setbit_wc (wc, ccl)) { REALLOC_IF_NECESSARY (work_mbc->chars, chars_al, work_mbc->nchars + 1); work_mbc->chars[work_mbc->nchars++] = wc; } + + if (case_fold && iswalpha (wc)) + { + unsigned int i; + for (i = 0; wctname[i]; i++) + { + wctrans_t wct = wctrans (wctname[i]); + if (!wct) + continue; + wc2 = towctrans (wc, wct); + if (wc2 != wc && !setbit_wc (wc2, ccl)) + { + REALLOC_IF_NECESSARY (work_mbc->chars, chars_al, + work_mbc->nchars + 1); + work_mbc->chars[work_mbc->nchars++] = wc2; + } + } + } } while ((wc = wc1, (c = c1) != ']')); @@ -1731,11 +1756,22 @@ atom (void) } else if (MBS_SUPPORT && tok == WCHAR) { - addtok_wc (case_fold ? 
towlower (wctok) : wctok); + addtok_wc (wctok); if (case_fold && iswalpha (wctok)) { - addtok_wc (towupper (wctok)); - addtok (OR); + unsigned int i; + for (i = 0; wctname[i]; i++) + { + wctrans_t wct = wctrans (wctname[i]); + if (!wct) + continue; + wint_t wc2 = towctrans (wctok, wct); + if (wc2 != (wint_t) wctok) + { + addtok_wc (wc2); + addtok (OR); + } + } } tok = lex (); diff --git a/src/main.c b/src/main.c index 42f9ff3..03d4d55 100644 --- a/src/main.c +++ b/src/main.c @@ -1873,6 +1873,8 @@ parse_grep_colors (void) (*(s) = wctob ((wint_t) (wc)), 1) : \ wcrtomb ((s), (wc), (ps))) +static const char * const wctname[] = { "tolower", "toupper", "totitle", NULL }; + /* If the newline-separated regular expressions, KEYS (with length, LEN and no trailing NUL byte), are amenable to transformation into otherwise equivalent case-ignoring ones, perform the transformation, @@ -1923,14 +1925,24 @@ trivial_case_ignore (size_t len, char const *keys, memcpy (p, orig, n); p += n; - wchar_t wc2 = iswupper (wc) ? 
towlower (wc) : towupper (wc); - char buf[MB_CUR_MAX]; - int n2 = WCRTOMB (buf, wc2, &mb_state); - if (n2 <= 0) - goto skip_case_ignore_optimization; - assert (n2 <= MB_CUR_MAX); - memcpy (p, buf, n2); - p += n2; + unsigned int i; + for (i = 0; wctname[i]; i++) + { + wctrans_t wct = wctrans (wctname[i]); + if (!wct) + continue; + wchar_t wc2 = (wchar_t) towctrans (wc, wct); + if (wc2 != wc) + { + char buf[MB_CUR_MAX]; + int n2 = WCRTOMB (buf, wc2, &mb_state); + if (n2 <= 0) + goto skip_case_ignore_optimization; + assert (n2 <= MB_CUR_MAX); + memcpy (p, buf, n2); + p += n2; + } + } *p++ = ']'; } diff --git a/tests/Makefile.am b/tests/Makefile.am index e2967fa..0a3af40 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -47,6 +47,7 @@ TESTS = \ case-fold-char-class \ case-fold-char-range \ case-fold-char-type \ + case-fold-title-case \ char-class-multibyte \ char-class-multibyte2 \ dfa-coverage \ diff --git a/tests/case-fold-title-case b/tests/case-fold-title-case index e69de29..d3acfc8 100755 --- a/tests/case-fold-title-case +++ b/tests/case-fold-title-case @@ -0,0 +1,34 @@ +#!/bin/sh +# This would fail for grep-2.16 +. "${srcdir=.}/init.sh"; path_prepend_ ../src + +printf '\xC7\x87\n' > exp1 || framework_failure_ +printf '\xC7\x88\n' > exp2 || framework_failure_ +printf '\xC7\x89\n' > exp3 || framework_failure_ +fail=0 + +for LOC in en_US.UTF-8 $LOCALE_FR_UTF8; do + out=out1-$LOC + LC_ALL=$LOC grep -i -f exp2 exp1 > $out || fail=1 + compare exp1 $out || fail=1 +done + +for LOC in en_US.UTF-8 $LOCALE_FR_UTF8; do + out=out2-$LOC + LC_ALL=$LOC grep -i -f exp3 exp1 > $out || fail=1 + compare exp1 $out || fail=1 +done + +for LOC in en_US.UTF-8 $LOCALE_FR_UTF8; do + out=out1-$LOC + LC_ALL=$LOC grep -i -f exp1 exp2 > $out || fail=1 + compare exp2 $out || fail=1 +done + +for LOC in en_US.UTF-8 $LOCALE_FR_UTF8; do + out=out2-$LOC + LC_ALL=$LOC grep -i -f exp3 exp2 > $out || fail=1 + compare exp2 $out || fail=1 +done + +Exit $fail