On 3/26/21 3:21 PM, Paul Eggert wrote:
The -S code could use some more fixes in this area too - it can probably still dump core on platforms like the Hurd that don't limit exec arg size - but one thing at a time.

I fixed the (unlikely) bugs I found in this area by installing the attached.

>From e3766c5db176ca7abbb8212d5b0b7862fb98a5be Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Mon, 29 Mar 2021 21:42:44 -0700
Subject: [PATCH] env: simplify --split-string memory management

* bootstrap.conf (gnulib_modules): Add idx.
* src/env.c: Include idx.h, minmax.h.
Prefer idx_t to ptrdiff_t when values are nonnegative.
(valid_escape_sequence, escape_char, validate_split_str)
(CHECK_START_NEW_ARG):
Remove; no longer needed now that we validate as we go.
(struct splitbuf): New type.
(splitbuf_grow, splitbuf_append_byte, check_start_new_arg)
(splitbuf_finishup): New functions.
(build_argv): New arg ARGC.  Validate and process in one go, using
the new functions; this is simpler and more reliable than the old
approach (as witness the recent bug).  Avoid integer overflow in
the unlikely case where the string contains more than INT_MAX
arguments.
(parse_split_string): Simplify by exploiting the new build_argv.
---
 bootstrap.conf |   1 +
 src/env.c      | 385 ++++++++++++++++++++++---------------------------
 2 files changed, 173 insertions(+), 213 deletions(-)

diff --git a/bootstrap.conf b/bootstrap.conf
index ab6b3ef0c..f55da99db 100644
--- a/bootstrap.conf
+++ b/bootstrap.conf
@@ -134,6 +134,7 @@ gnulib_modules="
   host-os
   human
   idcache
+  idx
   ignore-value
   inttostr
   inttypes
diff --git a/src/env.c b/src/env.c
index 11db374d9..e2ab39fd5 100644
--- a/src/env.c
+++ b/src/env.c
@@ -26,6 +26,8 @@
 #include "system.h"
 #include "die.h"
 #include "error.h"
+#include "idx.h"
+#include "minmax.h"
 #include "operand2sig.h"
 #include "quote.h"
 #include "sig2str.h"
@@ -41,14 +43,14 @@
 /* Array of envvars to unset.  */
 static const char **usvars;
 static size_t usvars_alloc;
-static ptrdiff_t usvars_used;
+static idx_t usvars_used;
 
 /* Annotate the output with extra info to aid the user.  */
 static bool dev_debug;
 
 /* Buffer and length of extracted envvars in -S strings.  */
 static char *varname;
-static ptrdiff_t vnlen;
+static idx_t vnlen;
 
 /* Possible actions on each signal.  */
 enum SIGNAL_MODE {
@@ -175,7 +177,7 @@ append_unset_var (const char *var)
 static void
 unset_envvars (void)
 {
-  for (ptrdiff_t i = 0; i < usvars_used; ++i)
+  for (idx_t i = 0; i < usvars_used; ++i)
     {
       devmsg ("unset:    %s\n", usvars[i]);
 
@@ -190,29 +192,6 @@ unset_envvars (void)
   IF_LINT (usvars_alloc = 0);
 }
 
-static bool _GL_ATTRIBUTE_PURE
-valid_escape_sequence (const char c)
-{
-  return (c == 'c' || c == 'f' || c == 'n' || c == 'r' || c == 't' || c == 'v' \
-          || c == '#' || c == '$' || c == '_' || c == '"' || c == '\'' \
-          || c == '\\');
-}
-
-static char _GL_ATTRIBUTE_PURE
-escape_char (const char c)
-{
-  switch (c)
-    {
-    /* \a, \b not supported by FreeBSD's env.  */
-    case 'f': return '\f';
-    case 'n': return '\n';
-    case 'r': return '\r';
-    case 't': return '\t';
-    case 'v': return '\v';
-    default: assume (false);
-    }
-}
-
 /* Return a pointer to the end of a valid ${VARNAME} string, or NULL.
    'str' should point to the '$' character.
    First letter in VARNAME must be alpha or underscore,
@@ -241,7 +220,7 @@ scan_varname (const char *str)
 static char *
 extract_varname (const char *str)
 {
-  ptrdiff_t i;
+  idx_t i;
   const char *p;
 
   p = scan_varname (str);
@@ -263,150 +242,127 @@ extract_varname (const char *str)
   return varname;
 }
 
-/* Validate the "-S" parameter, according to the syntax defined by FreeBSD's
-   env(1).  Terminate with an error message if not valid.
+/* Temporary buffer used by --split-string processing.  */
+struct splitbuf
+{
+  /* Buffer address, arg count, and half the number of elements in the buffer.
+     ARGC and ARGV are as in 'main', and ARGC + 1 <= HALF_ALLOC so
+     that the upper half of ARGV can be used for string contents.
+     This may waste up to half the space but keeps the code simple,
+     which is better for this rarely-used but security-sensitive code.
+
+     ARGV[0] is not initialized; that is the caller's responsibility
+     after finalization.
+
+     During assembly, ARGV[I] (where 0 < I < ARGC) contains the offset
+     of the Ith string (relative to ARGV + HALF_ALLOC), so that
+     reallocating ARGV does not change the validity of its contents.
+     The integer offset is cast to char * during assembly, and is
+     converted to a true char * pointer on finalization.
+
+     During assembly, ARGV[ARGC] contains the offset of the first
+     unused string byte (relative to ARGV + HALF_ALLOC).  */
+  char **argv;
+  int argc;
+  idx_t half_alloc;
+
+  /* The number of extra argv slots to keep room for.  */
+  int extra_argc;
+
+  /* Whether processing should act as if the most recent character
+     seen was a separator.  */
+  bool sep;
+};
 
-   Calculate and set two values:
-   bufsize - the size (in bytes) required to hold the resulting string
-             after ENVVAR expansion (the value is overestimated).
-   maxargc - the maximum number of arguments (the size of the new argv).  */
+/* Expand SS so that it has at least one more argv slot and at least
+   one more string byte.  */
 static void
-validate_split_str (const char *str, ptrdiff_t *bufsize, int *maxargc)
+splitbuf_grow (struct splitbuf *ss)
 {
-  bool dq, sq, sp;
-  const char *pch;
-  ptrdiff_t buflen;
-  int cnt = 1;
+  idx_t old_half_alloc = ss->half_alloc;
+  idx_t string_bytes = (intptr_t) ss->argv[ss->argc];
+  ss->argv = xpalloc (ss->argv, &ss->half_alloc, 1,
+                      MIN (INT_MAX, IDX_MAX), 2 * sizeof *ss->argv);
+  memmove (ss->argv + ss->half_alloc, ss->argv + old_half_alloc, string_bytes);
+}
 
-  dq = sq = sp = false;
-  buflen = strlen (str) + 1;
+/* In SS, append C to the last string.  */
+static void
+splitbuf_append_byte (struct splitbuf *ss, char c)
+{
+  idx_t string_bytes = (intptr_t) ss->argv[ss->argc];
+  if (ss->half_alloc * sizeof *ss->argv <= string_bytes)
+    splitbuf_grow (ss);
+  ((char *) (ss->argv + ss->half_alloc))[string_bytes] = c;
+  ss->argv[ss->argc] = (char *) (intptr_t) (string_bytes + 1);
+}
 
-  while (*str)
+/* If SS's most recent character was a separator, finish off its
+   previous argument and start a new one.  */
+static void
+check_start_new_arg (struct splitbuf *ss)
+{
+  if (ss->sep)
     {
-      const char next = str[1];
-
-      if (c_isspace (*str) && !dq && !sq)
-        {
-          sp = true;
-        }
-      else
-        {
-          if (sp)
-            ++cnt;
-          sp = false;
-        }
-
-      switch (*str)
-        {
-        case '\'':
-          sq = !sq && !dq;
-          break;
-
-        case '"':
-          dq = !sq && !dq;
-          break;
-
-        case '\\':
-          if (dq && next == 'c')
-            die (EXIT_CANCELED, 0,
-                 _("'\\c' must not appear in double-quoted -S string"));
-
-          if (next == '\0')
-            die (EXIT_CANCELED, 0,
-                 _("invalid backslash at end of string in -S"));
-
-          if (!valid_escape_sequence (next))
-            die (EXIT_CANCELED, 0, _("invalid sequence '\\%c' in -S"), next);
-
-          if (next == '_')
-            ++cnt;
-
-          ++str;
-          break;
-
-
-        case '$':
-          if (sq)
-            break;
-
-          pch = extract_varname (str);
-          if (! pch)
-            die (EXIT_CANCELED, 0, _("only ${VARNAME} expansion is supported,"\
-                                     " error at: %s"), str);
-
-          pch = getenv (pch);
-          if (pch)
-            buflen += strlen (pch);
-          break;
-        }
-      ++str;
+      splitbuf_append_byte (ss, '\0');
+      int argc = ss->argc;
+      if (ss->half_alloc <= argc + ss->extra_argc + 1)
+        splitbuf_grow (ss);
+      ss->argv[argc + 1] = ss->argv[argc];
+      ss->argc = argc + 1;
+      ss->sep = false;
     }
+}
 
-  if (dq || sq)
-    die (EXIT_CANCELED, 0, _("no terminating quote in -S string"));
-
-  *maxargc = cnt;
-  *bufsize = buflen;
+/* All additions to SS have been made.  Convert its offsets to pointers,
+   and return the resulting argument vector.  */
+static char **
+splitbuf_finishup (struct splitbuf *ss)
+{
+  int argc = ss->argc;
+  char **argv = ss->argv;
+  char *stringbase = (char *) (ss->argv + ss->half_alloc);
+  for (int i = 1; i < argc; i++)
+    argv[i] = stringbase + (intptr_t) argv[i];
+  return argv;
 }
 
-/* Return a newly-allocated *arg[]-like array,
+/* Return a newly-allocated argv-like array,
    by parsing and splitting the input 'str'.
+
    'extra_argc' is the number of additional elements to allocate
    in the array (on top of the number of args required to split 'str').
 
+   Store into *argc the number of arguments found (plus 1 for
+   the program name).
+
    Example:
-     char **argv = build_argv ("A=B uname -k', 3)
+     int argc;
+     char **argv = build_argv ("A=B uname -k', 3, &argc);
    Results in:
-     argv[0] = "DUMMY" - dummy executable name, can be replaced later.
+     argc = 4
+     argv[0] = [not initialized]
      argv[1] = "A=B"
      argv[2] = "uname"
      argv[3] = "-k"
-     argv[4] = NULL
-     argv[5,6,7] = [allocated due to extra_argc, but not initialized]
+     argv[4,5,6,7] = [allocated due to extra_argc + 1, but not initialized]
 
-   The strings are stored in an allocated buffer, pointed by argv[0].
    To free allocated memory:
-     free (argv[0]);
-     free (argv); */
+     free (argv);
+   However, 'env' does not free since it's about to exec or exit anyway
+   and the complexity of keeping track of the storage that may have been
+   allocated via multiple calls to build_argv is not worth the hassle.  */
 static char **
-build_argv (const char *str, int extra_argc)
+build_argv (const char *str, int extra_argc, int *argc)
 {
-  bool dq = false, sq = false, sep = true;
-
-  /* Buffer to hold the new argv values.  Allocated as one buffer, but
-     will contain multiple NUL-terminate strings.  */
-  char *dest;
-
-  char **newargv, **nextargv;
-  int newargc = 0;
-  ptrdiff_t buflen = 0;
-
-  /* This macro is called before inserting any characters to the output
-     buffer.  It checks if the previous character was a separator
-     and if so starts a new argv element.  */
-#define CHECK_START_NEW_ARG                     \
-  do {                                          \
-    if (sep)                                    \
-      {                                         \
-        *dest++ = '\0';                         \
-        *nextargv++ = dest;                     \
-        sep = false;                            \
-      }                                         \
-  } while (0)
-
-  validate_split_str (str, &buflen, &newargc);
-
-  /* Allocate buffer.  +6 for the "DUMMY\0" executable name, +1 for NUL.  */
-  dest = xmalloc (buflen + 6 + 1);
-
-  /* Allocate the argv array.
-     +2 for the program name (argv[0]) and the last NULL pointer.  */
-  nextargv = newargv = xmalloc ((newargc + extra_argc + 2) * sizeof (char *));
-
-  /* argv[0] = executable's name  - will be replaced later.  */
-  strcpy (dest, "DUMMY");
-  *nextargv++ = dest;
-  dest += 6;
+  bool dq = false, sq = false;
+  struct splitbuf ss;
+  ss.argv = xnmalloc (extra_argc + 2, 2 * sizeof *ss.argv);
+  ss.argc = 1;
+  ss.half_alloc = extra_argc + 2;
+  ss.extra_argc = extra_argc;
+  ss.sep = true;
+  ss.argv[ss.argc] = 0;
 
   /* In the following loop,
      'break' causes the character 'newc' to be added to *dest,
@@ -421,7 +377,7 @@ build_argv (const char *str, int extra_argc)
           if (dq)
             break;
           sq = !sq;
-          CHECK_START_NEW_ARG;
+          check_start_new_arg (&ss);
           ++str;
           continue;
 
@@ -429,7 +385,7 @@ build_argv (const char *str, int extra_argc)
           if (sq)
             break;
           dq = !dq;
-          CHECK_START_NEW_ARG;
+          check_start_new_arg (&ss);
           ++str;
           continue;
 
@@ -437,12 +393,12 @@ build_argv (const char *str, int extra_argc)
           /* Start a new argument if outside quotes.  */
           if (sq || dq)
             break;
-          sep = true;
+          ss.sep = true;
           str += strspn (str, C_ISSPACE_CHARS);
           continue;
 
         case '#':
-          if (!sep)
+          if (!ss.sep)
             break;
           goto eos; /* '#' as first char terminates the string.  */
 
@@ -454,26 +410,41 @@ build_argv (const char *str, int extra_argc)
 
           /* Skip the backslash and examine the next character.  */
           newc = *++str;
-          if (newc == '\\' || newc == '\''
-              || (!sq && (newc == '#' || newc == '$' || newc == '"')))
+          switch (newc)
             {
+            case '"': case '#': case '$': case '\'': case '\\':
               /* Pass escaped character as-is.  */
-            }
-          else if (newc == '_')
-            {
+              break;
+
+            case '_':
               if (!dq)
                 {
                   ++str;  /* '\_' outside double-quotes is arg separator.  */
-                  sep = true;
+                  ss.sep = true;
                   continue;
                 }
-              else
-                newc = ' ';  /* '\_' inside double-quotes is space.  */
+              newc = ' ';  /* '\_' inside double-quotes is space.  */
+              break;
+
+            case 'c':
+              if (dq)
+                die (EXIT_CANCELED, 0,
+                     _("'\\c' must not appear in double-quoted -S string"));
+              goto eos; /* '\c' terminates the string.  */
+
+            case 'f': newc = '\f'; break;
+            case 'n': newc = '\n'; break;
+            case 'r': newc = '\r'; break;
+            case 't': newc = '\t'; break;
+            case 'v': newc = '\v'; break;
+
+            case '\0':
+              die (EXIT_CANCELED, 0,
+                   _("invalid backslash at end of string in -S"));
+
+            default:
+              die (EXIT_CANCELED, 0, _("invalid sequence '\\%c' in -S"), newc);
             }
-          else if (newc == 'c')
-            goto eos; /* '\c' terminates the string.  */
-          else
-            newc = escape_char (newc);  /* Other characters (e.g., '\n').  */
           break;
 
         case '$':
@@ -484,12 +455,18 @@ build_argv (const char *str, int extra_argc)
           /* Store the ${VARNAME} value.  */
           {
             char *n = extract_varname (str);
+            if (!n)
+              die (EXIT_CANCELED, 0,
+                   _("only ${VARNAME} expansion is supported, error at: %s"),
+                   str);
+
             char *v = getenv (n);
             if (v)
               {
-                CHECK_START_NEW_ARG;
+                check_start_new_arg (&ss);
                 devmsg ("expanding ${%s} into %s\n", n, quote (v));
-                dest = stpcpy (dest, v);
+                for (; *v; v++)
+                  splitbuf_append_byte (&ss, *v);
               }
             else
               devmsg ("replacing ${%s} with null string\n", n);
@@ -499,16 +476,18 @@ build_argv (const char *str, int extra_argc)
           }
         }
 
-      CHECK_START_NEW_ARG;
-      *dest++ = newc;
+      check_start_new_arg (&ss);
+      splitbuf_append_byte (&ss, newc);
       ++str;
     }
 
- eos:
-  *dest = '\0';
-  *nextargv = NULL; /* Mark the last element in argv as NULL.  */
+  if (dq || sq)
+    die (EXIT_CANCELED, 0, _("no terminating quote in -S string"));
 
-  return newargv;
+ eos:
+  splitbuf_append_byte (&ss, '\0');
+  *argc = ss.argc;
+  return splitbuf_finishup (&ss);
 }
 
 /* Process an "-S" string and create the corresponding argv array.
@@ -517,67 +496,47 @@ build_argv (const char *str, int extra_argc)
    Example: if executed as:
       $ env -S"-i -C/tmp A=B" foo bar
    The input argv is:
-      argv[0] = 'env'
+      argv[0] = "env"
       argv[1] = "-S-i -C/tmp A=B"
-      argv[2] = foo
-      argv[3] = bar
+      argv[2] = "foo"
+      argv[3] = "bar"
+      argv[4] = NULL
    This function will modify argv to be:
-      argv[0] = 'env'
+      argv[0] = "env"
       argv[1] = "-i"
       argv[2] = "-C/tmp"
-      argv[3] =  A=B"
-      argv[4] = foo
-      argv[5] = bar
+      argv[3] = "A=B"
+      argv[4] = "foo"
+      argv[5] = "bar"
+      argv[6] = NULL
    argc will be updated from 4 to 6.
    optind will be reset to 0 to force getopt_long to rescan all arguments.  */
 static void
 parse_split_string (const char *str, int *orig_optind,
                     int *orig_argc, char ***orig_argv)
 {
-  int i, newargc;
-  char **newargv, **nextargv;
-
-
-  while (c_isspace (*str))
-    str++;
-  if (*str == '\0')
-    return;
-
-  newargv = build_argv (str, *orig_argc - *orig_optind);
+  int extra_argc = *orig_argc - *orig_optind, newargc;
+  char **newargv = build_argv (str, extra_argc, &newargc);
 
   /* Restore argv[0] - the 'env' executable name.  */
   *newargv = (*orig_argv)[0];
 
-  /* Start from argv[1] */
-  nextargv = newargv + 1;
-
-  /* Print parsed arguments */
-  if (dev_debug && *nextargv)
+  /* Print parsed arguments.  */
+  if (dev_debug && 1 < newargc)
     {
       devmsg ("split -S:  %s\n", quote (str));
-      devmsg (" into:    %s\n", quote (*nextargv++));
-      while (*nextargv)
-        devmsg ("     &    %s\n", quote (*nextargv++));
+      devmsg (" into:    %s\n", quote (newargv[1]));
+      for (int i = 2; i < newargc; i++)
+        devmsg ("     &    %s\n", quote (newargv[i]));
     }
-  else
-    {
-      /* Ensure nextargv points to the last argument */
-      while (*nextargv)
-        ++nextargv;
-    }
-
-  /* Add remaining arguments from original command line */
-  for (i = *orig_optind; i < *orig_argc; ++i)
-    *nextargv++ = (*orig_argv)[i];
-  *nextargv = NULL;
 
-  /* Count how many new arguments we have */
-  newargc = 0;
-  for (nextargv = newargv; *nextargv; ++nextargv)
-    ++newargc;
+  /* Add remaining arguments and terminating null from the original
+     command line.  */
+  memcpy (newargv + newargc, *orig_argv + *orig_optind,
+          (extra_argc + 1) * sizeof *newargv);
 
   /* Set new values for original getopt variables.  */
-  *orig_argc = newargc;
+  *orig_argc = newargc + extra_argc;
   *orig_argv = newargv;
   *orig_optind = 0; /* Tell getopt to restart from first argument.  */
 }
-- 
2.25.1

Reply via email to