The docs are clear that the only valid use of the experimental
changeword is to pass a regex where all prefixes of a desired
acceptable word are also accepted.  If this constraint is not met, and
the regex can match a longer string but not the one-byte prefix of
that string, then the fastmap of that regex will still be set on that
byte (as it could be a valid anchor to try searching for a longer
match), but when re_search() then fails to match, we are left with
garbage in regs.start.

The docs even had an example where this constraint is not met, but
because of the way the test was written, the first macro parsed after
changeword was "dnl", which populated regs with something that happens
to work on the next attempt to parse with failure to match "f".  But
the test added here demonstrates that without a prior regex match, if
the first byte after changeword is in the fastmap but fails to parse,
then regs.start will still be NULL and crash m4.  And even when m4
didn't crash, it's still better to only rely on regs after re-running
the regex on the largest string that did match, rather than whatever
is left in regs after the first failure to match a one-byte-longer
string.

I _really_ want to get rid of changeword.  It slows things down, and
is a nightmare to maintain.  And the fact that NO ONE reported this
regression introduced in 2008 (because most distros wisely refuse to
build with --enable-changeword) means it won't be missed.  But
removing a feature, even experimental, should be done for 1.6, not
1.4.x.

* NEWS: Document the bug fix.  Also a whitespace fix.
* cfg.mk (old_NEWS_hash): Run 'make update-NEWS-hash'.
* m4.texi (Changeword): Test for the bug.
* input.c (word_start) [ENABLE_CHANGEWORD]: Revert commit cde8ed62,
but this time use a static array rather than malloc'd pointer.
(set_word_regexp): Populate word_start.
(peek_token): Use it.
(next_token): Likewise, and don't use regs.start[1] if regex did not
have \(\) grouping.
---
 NEWS        |  8 ++++++++
 cfg.mk      |  2 +-
 doc/m4.texi | 13 +++++++++++++
 src/input.c | 26 +++++++++++++++++++++++---
 4 files changed, 45 insertions(+), 4 deletions(-)

diff --git a/NEWS b/NEWS
index 6c09605f..c8ce03e4 100644
--- a/NEWS
+++ b/NEWS
@@ -16,12 +16,20 @@ GNU M4 NEWS - User visible changes.
    change belongs in a major version release such as 1.6, and not a minor
    release.

+** Fix regression introduced in 1.4.11 where the experimental `changeword'
+   builtin could cause a crash if given a regex that does not match all
+   one-byte prefixes of valid longer matches.  As a reminder, `changeword'
+   is not recommended for production use, and will likely not be present
+   in the next major version release.
+
+
 * Noteworthy changes in release 1.4.19 (2021-05-28) [stable]

 ** A number of portability improvements inherited from gnulib, including
    the ability to perform stack overflow detection on more platforms
    without linking to GNU libsigsegv.

+
 * Noteworthy changes in release 1.4.18d (2021-05-11) [beta]

 ** A number of portability improvements inherited from gnulib.
diff --git a/cfg.mk b/cfg.mk
index 7f5c01ab..81fdc459 100644
--- a/cfg.mk
+++ b/cfg.mk
@@ -34,7 +34,7 @@ local-checks-to-skip += sc_cast_of_x_alloc_return_value
 config_h_header = "m4\.h"

 # Hash of NEWS contents, to ensure we don't add entries to wrong section.
-old_NEWS_hash = 4e455cf16043061b7ffe289a9b604f33
+old_NEWS_hash = beb0fe1124e1148f05fc64abac57eff5

 # Update m4-latest.tar.* symlinks during 'make stable/beta'.
 GNUPLOADFLAGS = --symlink-regex
diff --git a/doc/m4.texi b/doc/m4.texi
index ed169e68..ef9828cc 100644
--- a/doc/m4.texi
+++ b/doc/m4.texi
@@ -4740,6 +4740,19 @@ Changeword
 @end example

 @ignore
+@comment Expose a core dump introduced in 1.4.11.
+@example
+ifdef(`changeword', `', `errprint(` skipping: no changeword support
+')m4exit(`77')')dnl
+define(`foo
+', `bar
+')
+@result{}
+changeword(`[cd][a-z]*\|foo[
+]')foo
+@result{}foo
+@end example
+
 @comment One more test of including newline in a macro name; but this
 @comment does not need to be displayed in the manual.  This ensures
 @comment that line numbering is correct when dnl cuts across include
diff --git a/src/input.c b/src/input.c
index 754bf4ee..86e2ac1e 100644
--- a/src/input.c
+++ b/src/input.c
@@ -154,6 +154,7 @@ STRING ecomm;

 # define DEFAULT_WORD_REGEXP "[_a-zA-Z][_a-zA-Z0-9]*"

+static char word_start[256];
 static struct re_pattern_buffer word_regexp;
 static int default_word_regexp;
 static struct re_registers regs;
@@ -772,6 +773,8 @@ set_comment (const char *bc, const char *ec)
 void
 set_word_regexp (const char *regexp)
 {
+  int i;
+  char test[2] = "";
   const char *msg;
   struct re_pattern_buffer new_word_regexp;

@@ -806,6 +809,20 @@ set_word_regexp (const char *regexp)
     assert (false);

   default_word_regexp = false;
+
+  /* The fastmap contains any byte that can start a word. But we still
+     need to know which bytes can be matched as a word in isolation
+     (although the documentations requires that all prefixes of a
+     user's desired words to also be matched, this catches when a user
+     did not follow that limitation).  */
+  for (i = 1; i < 256; i++)
+    {
+      if (word_regexp.fastmap[i])
+        {
+          test[0] = i;
+          word_start[i] = re_search (&word_regexp, test, 1, 0, 0, NULL) >= 0;
+        }
+    }
 }

 #endif /* ENABLE_CHANGEWORD */
@@ -897,7 +914,7 @@ next_token (token_data *td, int *line)

 #ifdef ENABLE_CHANGEWORD

-  else if (!default_word_regexp && word_regexp.fastmap[ch])
+  else if (!default_word_regexp && word_start[ch])
     {
       obstack_1grow (&token_stack, ch);
       while (1)
@@ -915,6 +932,9 @@ next_token (token_data *td, int *line)
             {
               *(((char *) obstack_base (&token_stack)
                  + obstack_object_size (&token_stack)) - 1) = '\0';
+              re_search (&word_regexp,
+                         (char *) obstack_base (&token_stack),
+                         obstack_object_size (&token_stack) - 1, 0, 0, &regs);
               break;
             }
           next_char ();
@@ -923,7 +943,7 @@ next_token (token_data *td, int *line)
       obstack_1grow (&token_stack, '\0');
       orig_text = (char *) obstack_finish (&token_stack);

-      if (regs.start[1] != -1)
+      if (regs.num_regs && regs.start[1] != -1)
         obstack_grow (&token_stack, orig_text + regs.start[1],
                       regs.end[1] - regs.start[1]);
       else
@@ -1059,7 +1079,7 @@ peek_token (void)
     }
   else if ((default_word_regexp && (c_isalpha (ch) || ch == '_'))
 #ifdef ENABLE_CHANGEWORD
-           || (!default_word_regexp && word_regexp.fastmap[ch])
+           || (!default_word_regexp && word_start[ch])
 #endif /* ENABLE_CHANGEWORD */
     )
     {
-- 
2.49.0


_______________________________________________
M4-patches mailing list
M4-patches@gnu.org
https://lists.gnu.org/mailman/listinfo/m4-patches

Reply via email to