Here are the updated Rational Range Interpretation (RRI) patches for grep. The first one is for dfa.c and doc/grep.texi. It does NOT handle removal of hard-locale.[ch] from lib/ and from the make infrastructure.
The second patch is for gnulib. Both are relative to master in both git repos as of less than an hour ago. Thanks, Arnold ------------------ >From 9b16fdee4edf2b4ea8fc4cfc6b6c45bde6ec8cd4 Mon Sep 17 00:00:00 2001 From: Arnold D. Robbins <[email protected]> Date: Fri, 27 Apr 2012 12:03:16 +0300 Subject: [PATCH] Implement/Document Rational Range Interpretation. --- doc/grep.texi | 21 ++++++++++++++++----- src/dfa.c | 40 ++++++---------------------------------- 2 files changed, 22 insertions(+), 39 deletions(-) diff --git a/doc/grep.texi b/doc/grep.texi index 000a844..3af72f3 100644 --- a/doc/grep.texi +++ b/doc/grep.texi @@ -958,9 +958,7 @@ They are omitted (i.e., false) by default and become true when specified. @cindex character type @cindex national language support @cindex NLS -These variables specify the locale for the @code{LC_COLLATE} category, -which determines the collating sequence -used to interpret range expressions like @samp{[a-z]}. +These variables specify the locale for the @code{LC_COLLATE} category. @item LC_ALL @itemx LC_CTYPE @@ -1221,7 +1219,12 @@ For example, the regular expression Within a bracket expression, a @dfn{range expression} consists of two characters separated by a hyphen. It matches any single character that -sorts between the two characters, inclusive, using the locale's +sorts between the two characters, inclusive, +using the machine's character set. + +Up to and including version 2.12 of @command{grep}, +range expressions would match any single character that sorted between +the two characters, inclusive, using the current locale's collating sequence and character set. For example, in the default C locale, @samp{[a-d]} is equivalent to @samp{[abcd]}. @@ -1230,9 +1233,17 @@ characters in dictionary order, and in these locales @samp{[a-d]} is typically not equivalent to @samp{[abcd]}; it might be equivalent to @samp{[aBbCcDd]}, for example. 
To obtain the traditional interpretation -of bracket expressions, you can use the @samp{C} locale by setting the +of bracket expressions, it was necessary to use the @samp{C} locale +by setting the @env{LC_ALL} environment variable to the value @samp{C}. +Since the current POSIX standard now makes the behavior of range expressions +be implementation-defined, instead of requiring the locale's +collating order, @command{grep} has reverted to the traditional Unix +behavior of defining ranges based on the machine character set.@footnote{This +is known as ``Rational Range Interpretation,'' a lovely phrase +coined by Karl Berry.} + Finally, certain named classes of characters are predefined within bracket expressions, as follows. Their interpretation depends on the @code{LC_CTYPE} locale; diff --git a/src/dfa.c b/src/dfa.c index 1cbe537..c690e10 100644 --- a/src/dfa.c +++ b/src/dfa.c @@ -29,6 +29,7 @@ #include <limits.h> #include <string.h> #include <locale.h> +#include <stdbool.h> #define STREQ(a, b) (strcmp (a, b) == 0) @@ -46,7 +47,7 @@ #include "gettext.h" #define _(str) gettext (str) -#include "mbsupport.h" /* defines MBS_SUPPORT if appropriate */ +#include "mbsupport.h" /* defines MBS_SUPPORT to 1 or 0, as appropriate */ #include <wchar.h> #include <wctype.h> @@ -56,7 +57,6 @@ #include "regex.h" #include "dfa.h" -#include "hard-locale.h" #include "xalloc.h" /* HPUX, define those as macros in sys/param.h */ @@ -777,7 +777,6 @@ static int laststart; /* True if we're separated from beginning or (, only by zero-width characters. */ static size_t parens; /* Count of outstanding left parens. */ static int minrep, maxrep; /* Repeat counts for {m,n}. */ -static int hard_LC_COLLATE; /* Nonzero if LC_COLLATE is hard. */ static int cur_mb_len = 1; /* Length of the multibyte representation of wctok. 
*/ @@ -1111,26 +1110,8 @@ parse_bracket_exp (void) c1 = tolower (c1); c2 = tolower (c2); } - if (!hard_LC_COLLATE) - for (c = c1; c <= c2; c++) - setbit_case_fold_c (c, ccl); - else - { - /* Defer to the system regex library about the meaning - of range expressions. */ - regex_t re; - char pattern[6] = { '[', c1, '-', c2, ']', 0 }; - char subject[2] = { 0, 0 }; - regcomp (&re, pattern, REG_NOSUB); - for (c = 0; c < NOTCHAR; ++c) - { - subject[0] = c; - if (!(case_fold && isupper (c)) - && regexec (&re, subject, 0, NULL, 0) != REG_NOMATCH) - setbit_case_fold_c (c, ccl); - } - regfree (&re); - } + for (c = c1; c <= c2; c++) + setbit_case_fold_c (c, ccl); } colon_warning_state |= 8; @@ -1878,9 +1859,6 @@ dfaparse (char const *s, size_t len, struct dfa *d) lasttok = END; laststart = 1; parens = 0; -#ifdef LC_COLLATE - hard_LC_COLLATE = hard_locale (LC_COLLATE); -#endif if (MB_CUR_MAX > 1) { cur_mb_len = 0; @@ -2966,7 +2944,6 @@ match_mb_charset (struct dfa *d, state_num s, position pos, size_t idx) with which this operator match. */ int op_len; /* Length of the operator. */ char buffer[128]; - wchar_t wcbuf[6]; /* Pointer to the structure to which we are currently referring. */ struct mb_char_classes *work_mbc; @@ -3039,16 +3016,11 @@ match_mb_charset (struct dfa *d, state_num s, position pos, size_t idx) } } - wcbuf[0] = wc; - wcbuf[1] = wcbuf[3] = wcbuf[5] = '\0'; - /* match with a range? */ for (i = 0; i < work_mbc->nranges; i++) { - wcbuf[2] = work_mbc->range_sts[i]; - wcbuf[4] = work_mbc->range_ends[i]; - - if (wcscoll (wcbuf, wcbuf + 2) >= 0 && wcscoll (wcbuf + 4, wcbuf) >= 0) + if (work_mbc->range_sts[i] <= wc && + wc <= work_mbc->range_ends[i]) goto charset_matched; } -- 1.7.1 >From 5c7665f2ced46d2e830958bce1bf46469995d3de Mon Sep 17 00:00:00 2001 From: Arnold D. Robbins <[email protected]> Date: Fri, 27 Apr 2012 12:04:22 +0300 Subject: [PATCH] Implement Rational Range Interpretation in Gnulib. 
--- lib/regcomp.c | 13 +++---------- lib/regexec.c | 12 ++---------- 2 files changed, 5 insertions(+), 20 deletions(-) diff --git a/lib/regcomp.c b/lib/regcomp.c index b51a9a6..7748535 100644 --- a/lib/regcomp.c +++ b/lib/regcomp.c @@ -2702,7 +2702,6 @@ build_range_exp (const reg_syntax_t syntax, wchar_t wc; wint_t start_wc; wint_t end_wc; - wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'}; start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0] @@ -2716,11 +2715,7 @@ build_range_exp (const reg_syntax_t syntax, ? __btowc (end_ch) : end_elem->opr.wch); if (start_wc == WEOF || end_wc == WEOF) return REG_ECOLLATE; - cmp_buf[0] = start_wc; - cmp_buf[4] = end_wc; - - if (BE ((syntax & RE_NO_EMPTY_RANGES) - && wcscoll (cmp_buf, cmp_buf + 4) > 0, 0)) + else if ((syntax & RE_NO_EMPTY_RANGES) && start_wc > end_wc) return REG_ERANGE; /* Got valid collation sequence values, add them as a new entry. @@ -2761,10 +2756,8 @@ build_range_exp (const reg_syntax_t syntax, /* Build the table for single byte characters. */ for (wc = 0; wc < SBC_MAX; ++wc) { - cmp_buf[2] = wc; - if (wcscoll (cmp_buf, cmp_buf + 2) <= 0 - && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0) - bitset_set (sbcset, wc); + if (start_wc <= wc && wc <= end_wc) + bitset_set (sbcset, wc); } } # else /* not RE_ENABLE_I18N */ diff --git a/lib/regexec.c b/lib/regexec.c index 92efb44..5a6a0dc 100644 --- a/lib/regexec.c +++ b/lib/regexec.c @@ -3986,18 +3986,10 @@ check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx, # endif /* _LIBC */ { /* match with range expression? */ -#if __GNUC__ >= 2 && ! 
(__STDC_VERSION__ < 199901L && defined __STRICT_ANSI__) - wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'}; -#else - wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'}; - cmp_buf[2] = wc; -#endif for (i = 0; i < cset->nranges; ++i) { - cmp_buf[0] = cset->range_starts[i]; - cmp_buf[4] = cset->range_ends[i]; - if (wcscoll (cmp_buf, cmp_buf + 2) <= 0 - && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0) + if (cset->range_starts[i] <= wc + && wc <= cset->range_ends[i]) { match_len = char_len; goto check_node_accept_bytes_match; -- 1.7.1
