POSIX doesn't permit an unescaped '/' in an extended regular expression.
Unlike upstream awk, ours has historically allowed an unescaped '/'
inside a bracket expression for compatibility with other awk
implementations, but the check was too simple-minded.  This diff improves
the matching to allow things like /[]/]/, /[^]/]/ and /[abc[:digit:]/@#]/,
which are also accepted by gawk and mawk.
To enable strict POSIX compliance, set POSIXLY_CORRECT in the environment.

My goal is to have something acceptable for upstream.  Can anyone
think of a case where this would not do the right thing?

 - todd

Index: usr.bin/awk/awk.1
===================================================================
RCS file: /cvs/src/usr.bin/awk/awk.1,v
retrieving revision 1.50
diff -u -p -u -r1.50 awk.1
--- usr.bin/awk/awk.1   10 Jun 2020 21:05:02 -0000      1.50
+++ usr.bin/awk/awk.1   11 Jun 2020 13:51:58 -0000
@@ -805,7 +805,9 @@ string argument for
 .Fn sub
 and
 .Fn gsub
-are not collapsed.
+are not collapsed and a slash
+.Pq Ql /
+does not need to be escaped in a bracket expression.
 .Pp
 The flags
 .Op Fl \&dV
Index: usr.bin/awk/awk.h
===================================================================
RCS file: /cvs/src/usr.bin/awk/awk.h,v
retrieving revision 1.23
diff -u -p -u -r1.23 awk.h
--- usr.bin/awk/awk.h   10 Jun 2020 21:06:09 -0000      1.23
+++ usr.bin/awk/awk.h   11 Jun 2020 13:51:58 -0000
@@ -62,6 +62,7 @@ extern enum compile_states {
 } compile_time;
 
 extern bool    safe;           /* false => unsafe, true => safe */
+extern bool    do_posix;       /* true if POSIXLY_CORRECT set */
 
 #define        RECSIZE (8 * 1024)      /* sets limit on records, fields, etc., etc. */
 extern int     recsize;        /* size of current record, orig RECSIZE */
Index: usr.bin/awk/lex.c
===================================================================
RCS file: /cvs/src/usr.bin/awk/lex.c,v
retrieving revision 1.19
diff -u -p -u -r1.19 lex.c
--- usr.bin/awk/lex.c   10 Jun 2020 21:05:50 -0000      1.19
+++ usr.bin/awk/lex.c   11 Jun 2020 13:51:58 -0000
@@ -524,12 +524,12 @@ int regexpr(void)
        int c, openclass = 0;
        static char *buf = NULL;
        static int bufsz = 500;
-       char *bp;
+       char *bp, *cstart;
 
        if (buf == NULL && (buf = malloc(bufsz)) == NULL)
                FATAL("out of space for rex expr");
        bp = buf;
-       for ( ; ((c = input()) != '/' || openclass == 1) && c != 0; ) {
+       for ( ; ((c = input()) != '/' || openclass > 0) && c != 0; ) {
                if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, "regexpr"))
                        FATAL("out of space for reg expr %.10s...", buf);
                if (c == '\n') {
@@ -541,10 +541,25 @@ int regexpr(void)
                        *bp++ = '\\';
                        *bp++ = input();
                } else {
-                       if (c == '[')
-                               openclass = 1;
-                       else if (c == ']')
-                               openclass = 0;
+                       /*
+                        * POSIX requires a slash in a regexp to be escaped,
+                        * other awks don't require it to be escaped inside
+                        * a character class.
+                        */
+                       if (!do_posix) {
+                               if (c == '[') {
+                                       if (++openclass == 1)
+                                               cstart = bp;
+                               } else if (c == ']' && openclass > 0) {
+                                       /*
+                                        * A ']' as the first char in a
+                                        * class is treated literally.
+                                        */
+                                       if (cstart != bp - 1 &&
+                                           (cstart != bp - 2 || bp[-1] != '^'))
+                                               openclass--;
+                               }
+                       }
                        *bp++ = c;
                }
        }
Index: usr.bin/awk/main.c
===================================================================
RCS file: /cvs/src/usr.bin/awk/main.c,v
retrieving revision 1.39
diff -u -p -u -r1.39 main.c
--- usr.bin/awk/main.c  11 Jun 2020 13:51:18 -0000      1.39
+++ usr.bin/awk/main.c  11 Jun 2020 13:51:58 -0000
@@ -53,7 +53,8 @@ static size_t maxpfile;       /* max program f
 static size_t  npfile;         /* number of filenames */
 static size_t  curpfile;       /* current filename */
 
-bool   safe = false;   /* true => "safe" mode */
+bool   safe = false;           /* true => "safe" mode */
+bool   do_posix = false;       /* true => POSIX mode */
 
 static noreturn void fpecatch(int n
 #ifdef SA_SIGINFO
@@ -161,6 +162,8 @@ int main(int argc, char *argv[])
 #else
        (void)signal(SIGFPE, fpecatch);
 #endif
+
+       do_posix = (getenv("POSIXLY_CORRECT") != NULL);
 
        yyin = NULL;
        symtab = makesymtab(NSYMTAB);
Index: usr.bin/awk/run.c
===================================================================
RCS file: /cvs/src/usr.bin/awk/run.c,v
retrieving revision 1.57
diff -u -p -u -r1.57 run.c
--- usr.bin/awk/run.c   10 Jun 2020 21:05:50 -0000      1.57
+++ usr.bin/awk/run.c   11 Jun 2020 13:51:58 -0000
@@ -2122,13 +2122,6 @@ void backsub(char **pb_ptr, const char *
 {                                              /* sptr[0] == '\\' */
        char *pb = *pb_ptr;
        const char *sptr = *sptr_ptr;
-       static bool first = true;
-       static bool do_posix = false;
-
-       if (first) {
-               first = false;
-               do_posix = (getenv("POSIXLY_CORRECT") != NULL);
-       }
 
        if (sptr[1] == '\\') {
                if (sptr[2] == '\\' && sptr[3] == '&') { /* \\\& -> \& */

Reply via email to