I like that as far as it goes, but it pulls loose a thread that has been
nagging me for a while. How about the attached instead? It includes
somewhat more simplification, entailing more-efficient handling of
caseless letters when ignoring case.
From 85efede266be9d2cda8d229c012828b6ae4574c5 Mon Sep 17 00:00:00 2001
From: Paul Eggert <[email protected]>
Date: Tue, 28 Jan 2014 13:47:47 -0800
Subject: [PATCH] Simplify handling of letter case.
* src/dfa.c (setbit_wc, setbit_case_fold_c, atom): Simplify.
(setbit_case_fold_c, parse_bracket_exp, lex, atom): Invoke tolower
and toupper instead of isalpha followed by one or the other, and
similarly for towlower, towupper, iswalpha. This should lead to
more-efficient handling of caseless letters, and it simplifies
the code.
---
src/dfa.c | 93 ++++++++++++++++++++++++++++++---------------------------------
1 file changed, 44 insertions(+), 49 deletions(-)
diff --git a/src/dfa.c b/src/dfa.c
index b79c604..72beed0 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -693,39 +693,24 @@ dfasyntax (reg_syntax_t bits, int fold, unsigned char eol)
this may happen when folding case in weird Turkish locales where
dotless i/dotted I are not included in the chosen character set.
Return whether a bit was set in the charclass. */
-#if MBS_SUPPORT
static bool
setbit_wc (wint_t wc, charclass c)
{
+#if MBS_SUPPORT
int b = wctob (wc);
if (b == EOF)
return false;
setbit (b, c);
return true;
-}
-
-/* Set a bit in the charclass for the given single byte character,
- if it is valid in the current character set. */
-static void
-setbit_c (int b, charclass c)
-{
- /* Do nothing if b is invalid in this character set. */
- if (MB_CUR_MAX > 1 && btowc (b) == WEOF)
- return;
- setbit (b, c);
-}
#else
-# define setbit_c setbit
-static inline bool
-setbit_wc (wint_t wc, charclass c)
-{
abort ();
/*NOTREACHED*/ return false;
-}
#endif
+}
-/* Like setbit_c, but if case is folded, set both cases of a letter. For
+/* Like setbit_wc but for a single-byte character B; and if case is
+ folded, set both cases of a letter. For
MB_CUR_MAX > 1, the resulting charset is only used as an optimization,
and the caller takes care of setting the appropriate field of struct
mb_char_classes. */
@@ -737,16 +722,16 @@ setbit_case_fold_c (int b, charclass c)
wint_t wc = btowc (b);
if (wc == WEOF)
return;
- setbit (b, c);
- if (case_fold && iswalpha (wc))
- setbit_wc (iswupper (wc) ? towlower (wc) : towupper (wc), c);
+ if (case_fold)
+ setbit_wc (wc ^ towlower (wc) ^ towupper (wc), c);
}
else
{
- setbit (b, c);
- if (case_fold && isalpha (b))
- setbit_c (isupper (b) ? tolower (b) : toupper (b), c);
+ if (case_fold)
+ setbit (b ^ tolower (b) ^ toupper (b), c);
}
+
+ setbit (b, c);
}
@@ -1085,23 +1070,30 @@ parse_bracket_exp (void)
{
/* When case folding map a range, say [m-z] (or even [M-z])
to the pair of ranges, [m-z] [M-Z]. */
+ wchar_t lo1 = wc, hi1 = wc2, lo2 = wc, hi2 = wc2;
+ if (case_fold)
+ {
+ lo1 = towlower (lo1);
+ hi1 = towlower (hi1);
+ lo2 = towupper (lo2);
+ hi2 = towupper (hi2);
+ }
+
REALLOC_IF_NECESSARY (work_mbc->range_sts,
range_sts_al, work_mbc->nranges + 1);
REALLOC_IF_NECESSARY (work_mbc->range_ends,
range_ends_al, work_mbc->nranges + 1);
- work_mbc->range_sts[work_mbc->nranges] =
- case_fold ? towlower (wc) : (wchar_t) wc;
- work_mbc->range_ends[work_mbc->nranges++] =
- case_fold ? towlower (wc2) : (wchar_t) wc2;
+ work_mbc->range_sts[work_mbc->nranges] = lo1;
+ work_mbc->range_ends[work_mbc->nranges++] = hi1;
- if (case_fold && (iswalpha (wc) || iswalpha (wc2)))
+ if (lo1 != lo2 || hi1 != hi2)
{
REALLOC_IF_NECESSARY (work_mbc->range_sts,
range_sts_al, work_mbc->nranges + 1);
- work_mbc->range_sts[work_mbc->nranges] = towupper (wc);
+ work_mbc->range_sts[work_mbc->nranges] = lo2;
REALLOC_IF_NECESSARY (work_mbc->range_ends,
range_ends_al, work_mbc->nranges + 1);
- work_mbc->range_ends[work_mbc->nranges++] = towupper (wc2);
+ work_mbc->range_ends[work_mbc->nranges++] = hi2;
}
}
else
@@ -1129,16 +1121,18 @@ parse_bracket_exp (void)
continue;
}
- if (case_fold && iswalpha (wc))
+ if (case_fold)
{
- wc = towlower (wc);
- if (!setbit_wc (wc, ccl))
+ wchar_t diff = towlower (wc) ^ towupper (wc);
+ if (diff)
{
- REALLOC_IF_NECESSARY (work_mbc->chars, chars_al,
- work_mbc->nchars + 1);
- work_mbc->chars[work_mbc->nchars++] = wc;
+ if (!setbit_wc (wc ^ diff, ccl))
+ {
+ REALLOC_IF_NECESSARY (work_mbc->chars, chars_al,
+ work_mbc->nchars + 1);
+ work_mbc->chars[work_mbc->nchars++] = wc ^ diff;
+ }
}
- wc = towupper (wc);
}
if (!setbit_wc (wc, ccl))
{
@@ -1481,7 +1475,7 @@ lex (void)
if (MB_CUR_MAX > 1)
return lasttok = WCHAR;
- if (case_fold && isalpha (c))
+ if (case_fold && tolower (c) != toupper (c))
{
zeroset (ccl);
setbit_case_fold_c (c, ccl);
@@ -1725,17 +1719,18 @@ add_utf8_anychar (void)
static void
atom (void)
{
- if (0)
- {
- /* empty */
- }
- else if (MBS_SUPPORT && tok == WCHAR)
+ if (MBS_SUPPORT && tok == WCHAR)
{
- addtok_wc (case_fold ? towlower (wctok) : wctok);
- if (case_fold && iswalpha (wctok))
+ wchar_t wc = wctok;
+ addtok_wc (wc);
+ if (case_fold)
{
- addtok_wc (towupper (wctok));
- addtok (OR);
+ wchar_t diff = towlower (wc) ^ towupper (wc);
+ if (diff)
+ {
+ addtok_wc (wc ^ diff);
+ addtok (OR);
+ }
}
tok = lex ();
--
1.8.5.3