POSIX doesn't permit an unescaped '/' in an extended regular expression.
Unlike upstream awk, ours has historically allowed an unescaped '/'
inside a bracket expression for compatibility with other awk
implementations, but the check was too simple-minded. This improves
the matching to allow things like '/[]/]/', '/[^]/]/' and '/[abc[:digit:]/@#]/',
which are also accepted by gawk and mawk.
To enable strict POSIX compliance, set the POSIXLY_CORRECT environment variable.
My goal is to have something acceptable for upstream. Can anyone
think of a case where this would not do the right thing?
- todd
Index: usr.bin/awk/awk.1
===================================================================
RCS file: /cvs/src/usr.bin/awk/awk.1,v
retrieving revision 1.50
diff -u -p -u -r1.50 awk.1
--- usr.bin/awk/awk.1 10 Jun 2020 21:05:02 -0000 1.50
+++ usr.bin/awk/awk.1 11 Jun 2020 13:51:58 -0000
@@ -805,7 +805,9 @@ string argument for
.Fn sub
and
.Fn gsub
-are not collapsed.
+are not collapsed and a slash
+.Pq Ql /
+does not need to be escaped in a bracket expression.
.Pp
The flags
.Op Fl \&dV
Index: usr.bin/awk/awk.h
===================================================================
RCS file: /cvs/src/usr.bin/awk/awk.h,v
retrieving revision 1.23
diff -u -p -u -r1.23 awk.h
--- usr.bin/awk/awk.h 10 Jun 2020 21:06:09 -0000 1.23
+++ usr.bin/awk/awk.h 11 Jun 2020 13:51:58 -0000
@@ -62,6 +62,7 @@ extern enum compile_states {
} compile_time;
extern bool safe; /* false => unsafe, true => safe */
+extern bool do_posix; /* true if POSIXLY_CORRECT set */
#define RECSIZE (8 * 1024) /* sets limit on records, fields, etc.,
etc. */
extern int recsize; /* size of current record, orig RECSIZE */
Index: usr.bin/awk/lex.c
===================================================================
RCS file: /cvs/src/usr.bin/awk/lex.c,v
retrieving revision 1.19
diff -u -p -u -r1.19 lex.c
--- usr.bin/awk/lex.c 10 Jun 2020 21:05:50 -0000 1.19
+++ usr.bin/awk/lex.c 11 Jun 2020 13:51:58 -0000
@@ -524,12 +524,12 @@ int regexpr(void)
int c, openclass = 0;
static char *buf = NULL;
static int bufsz = 500;
- char *bp;
+ char *bp, *cstart;
if (buf == NULL && (buf = malloc(bufsz)) == NULL)
FATAL("out of space for rex expr");
bp = buf;
- for ( ; ((c = input()) != '/' || openclass == 1) && c != 0; ) {
+ for ( ; ((c = input()) != '/' || openclass > 0) && c != 0; ) {
if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, "regexpr"))
FATAL("out of space for reg expr %.10s...", buf);
if (c == '\n') {
@@ -541,10 +541,25 @@ int regexpr(void)
*bp++ = '\\';
*bp++ = input();
} else {
- if (c == '[')
- openclass = 1;
- else if (c == ']')
- openclass = 0;
+ /*
+ * POSIX requires a slash in a regexp to be escaped,
+ * other awks don't require it to be escaped inside
+ * a character class.
+ */
+ if (!do_posix) {
+ if (c == '[') {
+ if (++openclass == 1)
+ cstart = bp;
+ } else if (c == ']' && openclass > 0) {
+ /*
+ * A ']' as the first char in a
+ * class is treated literally.
+ */
+ if (cstart != bp - 1 &&
+ (cstart != bp - 2 || bp[-1] != '^'))
+ openclass--;
+ }
+ }
*bp++ = c;
}
}
Index: usr.bin/awk/main.c
===================================================================
RCS file: /cvs/src/usr.bin/awk/main.c,v
retrieving revision 1.39
diff -u -p -u -r1.39 main.c
--- usr.bin/awk/main.c 11 Jun 2020 13:51:18 -0000 1.39
+++ usr.bin/awk/main.c 11 Jun 2020 13:51:58 -0000
@@ -53,7 +53,8 @@ static size_t maxpfile; /* max program f
static size_t npfile; /* number of filenames */
static size_t curpfile; /* current filename */
-bool safe = false; /* true => "safe" mode */
+bool safe = false; /* true => "safe" mode */
+bool do_posix = false; /* true => POSIX mode */
static noreturn void fpecatch(int n
#ifdef SA_SIGINFO
@@ -161,6 +162,8 @@ int main(int argc, char *argv[])
#else
(void)signal(SIGFPE, fpecatch);
#endif
+
+ do_posix = (getenv("POSIXLY_CORRECT") != NULL);
yyin = NULL;
symtab = makesymtab(NSYMTAB);
Index: usr.bin/awk/run.c
===================================================================
RCS file: /cvs/src/usr.bin/awk/run.c,v
retrieving revision 1.57
diff -u -p -u -r1.57 run.c
--- usr.bin/awk/run.c 10 Jun 2020 21:05:50 -0000 1.57
+++ usr.bin/awk/run.c 11 Jun 2020 13:51:58 -0000
@@ -2122,13 +2122,6 @@ void backsub(char **pb_ptr, const char *
{ /* sptr[0] == '\\' */
char *pb = *pb_ptr;
const char *sptr = *sptr_ptr;
- static bool first = true;
- static bool do_posix = false;
-
- if (first) {
- first = false;
- do_posix = (getenv("POSIXLY_CORRECT") != NULL);
- }
if (sptr[1] == '\\') {
if (sptr[2] == '\\' && sptr[3] == '&') { /* \\\& -> \& */