RRI patches for grep

Aharon Robbins Fri, 27 Apr 2012 02:07:50 -0700

Here are the updated RRI (Rational Range Interpretation) patches for grep.
The first one is for src/dfa.c and doc/grep.texi.  It does NOT handle the
removal of hard-locale.[ch] from lib/ and from the make infrastructure.


The second patch is for gnulib.  Each patch is relative to master in its
respective git repo as of less than an hour ago.

Thanks,

Arnold
------------------
From 9b16fdee4edf2b4ea8fc4cfc6b6c45bde6ec8cd4 Mon Sep 17 00:00:00 2001
From: Arnold D. Robbins <[email protected]>
Date: Fri, 27 Apr 2012 12:03:16 +0300
Subject: [PATCH] Implement/Document Rational Range Interpretation.

---
 doc/grep.texi |   21 ++++++++++++++++-----
 src/dfa.c     |   40 ++++++----------------------------------
 2 files changed, 22 insertions(+), 39 deletions(-)

diff --git a/doc/grep.texi b/doc/grep.texi
index 000a844..3af72f3 100644
--- a/doc/grep.texi
+++ b/doc/grep.texi
@@ -958,9 +958,7 @@ They are omitted (i.e., false) by default and become true when specified.
 @cindex character type
 @cindex national language support
 @cindex NLS
-These variables specify the locale for the @code{LC_COLLATE} category,
-which determines the collating sequence
-used to interpret range expressions like @samp{[a-z]}.
+These variables specify the locale for the @code{LC_COLLATE} category.
 
 @item LC_ALL
 @itemx LC_CTYPE
@@ -1221,7 +1219,12 @@ For example, the regular expression
 Within a bracket expression, a @dfn{range expression} consists of two
 characters separated by a hyphen.
 It matches any single character that
-sorts between the two characters, inclusive, using the locale's
+sorts between the two characters, inclusive,
+using the machine's character set.
+
+Up to and including version 2.12 of @command{grep},
+range expressions would match any single character that sorted between
+the two characters, inclusive, using the current locale's
 collating sequence and character set.
 For example, in the default C
 locale, @samp{[a-d]} is equivalent to @samp{[abcd]}.
@@ -1230,9 +1233,17 @@ characters in dictionary order, and in these locales @samp{[a-d]} is
 typically not equivalent to @samp{[abcd]};
 it might be equivalent to @samp{[aBbCcDd]}, for example.
 To obtain the traditional interpretation
-of bracket expressions, you can use the @samp{C} locale by setting the
+of bracket expressions, it was necessary to use the @samp{C} locale
+by setting the
 @env{LC_ALL} environment variable to the value @samp{C}.
 
+Since the current POSIX standard now makes the behavior of range expressions
+be implementation-defined, instead of requiring the locale's
+collating order, @command{grep} has reverted to the traditional Unix
+behavior of defining ranges based on the machine character set.@footnote{This
+is known as ``Rational Range Interpretation,'' a lovely phrase
+coined by Karl Berry.}
+
 Finally, certain named classes of characters are predefined within
 bracket expressions, as follows.
 Their interpretation depends on the @code{LC_CTYPE} locale;
diff --git a/src/dfa.c b/src/dfa.c
index 1cbe537..c690e10 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -29,6 +29,7 @@
 #include <limits.h>
 #include <string.h>
 #include <locale.h>
+#include <stdbool.h>
 
 #define STREQ(a, b) (strcmp (a, b) == 0)
 
@@ -46,7 +47,7 @@
 #include "gettext.h"
 #define _(str) gettext (str)
 
-#include "mbsupport.h"          /* defines MBS_SUPPORT if appropriate */
+#include "mbsupport.h"          /* defines MBS_SUPPORT to 1 or 0, as appropriate */
 #include <wchar.h>
 #include <wctype.h>
 
@@ -56,7 +57,6 @@
 
 #include "regex.h"
 #include "dfa.h"
-#include "hard-locale.h"
 #include "xalloc.h"
 
 /* HPUX, define those as macros in sys/param.h */
@@ -777,7 +777,6 @@ static int laststart;           /* True if we're separated from beginning or (,
                                    only by zero-width characters. */
 static size_t parens;           /* Count of outstanding left parens. */
 static int minrep, maxrep;      /* Repeat counts for {m,n}. */
-static int hard_LC_COLLATE;     /* Nonzero if LC_COLLATE is hard.  */
 
 static int cur_mb_len = 1;      /* Length of the multibyte representation of
                                    wctok.  */
@@ -1111,26 +1110,8 @@ parse_bracket_exp (void)
                   c1 = tolower (c1);
                   c2 = tolower (c2);
                 }
-              if (!hard_LC_COLLATE)
-                for (c = c1; c <= c2; c++)
-                  setbit_case_fold_c (c, ccl);
-              else
-                {
-                  /* Defer to the system regex library about the meaning
-                     of range expressions.  */
-                  regex_t re;
-                  char pattern[6] = { '[', c1, '-', c2, ']', 0 };
-                  char subject[2] = { 0, 0 };
-                  regcomp (&re, pattern, REG_NOSUB);
-                  for (c = 0; c < NOTCHAR; ++c)
-                    {
-                      subject[0] = c;
-                      if (!(case_fold && isupper (c))
-                          && regexec (&re, subject, 0, NULL, 0) != REG_NOMATCH)
-                        setbit_case_fold_c (c, ccl);
-                    }
-                  regfree (&re);
-                }
+              for (c = c1; c <= c2; c++)
+                setbit_case_fold_c (c, ccl);
             }
 
           colon_warning_state |= 8;
@@ -1878,9 +1859,6 @@ dfaparse (char const *s, size_t len, struct dfa *d)
   lasttok = END;
   laststart = 1;
   parens = 0;
-#ifdef LC_COLLATE
-  hard_LC_COLLATE = hard_locale (LC_COLLATE);
-#endif
   if (MB_CUR_MAX > 1)
     {
       cur_mb_len = 0;
@@ -2966,7 +2944,6 @@ match_mb_charset (struct dfa *d, state_num s, position pos, size_t idx)
                                    with which this operator match.  */
   int op_len;                   /* Length of the operator.  */
   char buffer[128];
-  wchar_t wcbuf[6];
 
   /* Pointer to the structure to which we are currently referring.  */
   struct mb_char_classes *work_mbc;
@@ -3039,16 +3016,11 @@ match_mb_charset (struct dfa *d, state_num s, position pos, size_t idx)
         }
     }
 
-  wcbuf[0] = wc;
-  wcbuf[1] = wcbuf[3] = wcbuf[5] = '\0';
-
   /* match with a range?  */
   for (i = 0; i < work_mbc->nranges; i++)
     {
-      wcbuf[2] = work_mbc->range_sts[i];
-      wcbuf[4] = work_mbc->range_ends[i];
-
-      if (wcscoll (wcbuf, wcbuf + 2) >= 0 && wcscoll (wcbuf + 4, wcbuf) >= 0)
+      if (work_mbc->range_sts[i] <= wc &&
+          wc <= work_mbc->range_ends[i])
         goto charset_matched;
     }
 
-- 
1.7.1


From 5c7665f2ced46d2e830958bce1bf46469995d3de Mon Sep 17 00:00:00 2001
From: Arnold D. Robbins <[email protected]>
Date: Fri, 27 Apr 2012 12:04:22 +0300
Subject: [PATCH] Implement Rational Range Interpretation in Gnulib.

---
 lib/regcomp.c |   13 +++----------
 lib/regexec.c |   12 ++----------
 2 files changed, 5 insertions(+), 20 deletions(-)

diff --git a/lib/regcomp.c b/lib/regcomp.c
index b51a9a6..7748535 100644
--- a/lib/regcomp.c
+++ b/lib/regcomp.c
@@ -2702,7 +2702,6 @@ build_range_exp (const reg_syntax_t syntax,
     wchar_t wc;
     wint_t start_wc;
     wint_t end_wc;
-    wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
 
     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
                : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
@@ -2716,11 +2715,7 @@ build_range_exp (const reg_syntax_t syntax,
              ? __btowc (end_ch) : end_elem->opr.wch);
     if (start_wc == WEOF || end_wc == WEOF)
       return REG_ECOLLATE;
-    cmp_buf[0] = start_wc;
-    cmp_buf[4] = end_wc;
-
-    if (BE ((syntax & RE_NO_EMPTY_RANGES)
-            && wcscoll (cmp_buf, cmp_buf + 4) > 0, 0))
+    else if ((syntax & RE_NO_EMPTY_RANGES) && start_wc > end_wc)
       return REG_ERANGE;
 
     /* Got valid collation sequence values, add them as a new entry.
@@ -2761,10 +2756,8 @@ build_range_exp (const reg_syntax_t syntax,
     /* Build the table for single byte characters.  */
     for (wc = 0; wc < SBC_MAX; ++wc)
       {
-       cmp_buf[2] = wc;
-       if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
-           && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
-         bitset_set (sbcset, wc);
+         if (start_wc <= wc && wc <= end_wc)
+           bitset_set (sbcset, wc);
       }
   }
 # else /* not RE_ENABLE_I18N */
diff --git a/lib/regexec.c b/lib/regexec.c
index 92efb44..5a6a0dc 100644
--- a/lib/regexec.c
+++ b/lib/regexec.c
@@ -3986,18 +3986,10 @@ check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx,
 # endif /* _LIBC */
        {
          /* match with range expression?  */
-#if __GNUC__ >= 2 && ! (__STDC_VERSION__ < 199901L && defined __STRICT_ANSI__)
-         wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
-#else
-         wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
-         cmp_buf[2] = wc;
-#endif
          for (i = 0; i < cset->nranges; ++i)
            {
-             cmp_buf[0] = cset->range_starts[i];
-             cmp_buf[4] = cset->range_ends[i];
-             if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
-                 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
+              if (cset->range_starts[i] <= wc
+                  && wc <= cset->range_ends[i])
                {
                  match_len = char_len;
                  goto check_node_accept_bytes_match;
-- 
1.7.1

RRI patches for grep

Reply via email to