Hey,

This overhauls dd(1) operand parsing for expressions.

 - Eliminate duplicate code by using a single routine, get_expr().
   If we check for negative inputs we can use strtoull(3) safely,
   which covers the accepted input range for both off_t and size_t.

 - We can then use a single switch statement -- refactored here
   into a new routine, get_multiplier() -- for suffix evaluation.

 - This also lets us consolidate error handling.  We can provide
   better error messages if we catch invalid inputs in-place.  For
   instance, we no longer need to double-check whether cbs is zero,
   or if any of the other block sizes managed to wind up zero.

 - Parse iteratively with strsep(3).  The recursive parsing is clever,
   but it provides no advantages over iterative parsing.  It also allows
   for some pathological cases like the following, which overflows the
   stack on macppc:

        count=$(perl -e 'print "1x"x100000, "1"')

   Using strsep(3) is briefer and more familiar.

 - Catch all (I think?) multiplicative overflow.  Formerly, you could
   get away with something like:

        2048x2147483649

   on a 32-bit architecture, or

        2048x4611686018427387905

   on a 64-bit architecture.

   No more.  We check for overflow on the input, overflow
   when multiplying the input by the value of the suffix,
   and overflow when multiplying a new term by the product
   of the prior terms in the expression.

 - Standard dd(1) does not support hexidecimal or octal
   input.  BSD dd(1) does [1], but GNU dd(1) does not.

   Supporting both 'x' multiplication and hex input might
   be useful, but I think it's a bad idea from a parsing
   perspective.  I'm not sure if it makes the grammar ambiguous,
   but it does make it harder to reason about at a glance.
   What is the value of the following?

        2x0x10K

   For us it's both a valid block size and a valid seek, but
   in GNU dd it's only valid as a seek or a count.

   That's a bit pathological, but I don't think enforcing decimal
   input, as prescribed by POSIX and adhered to by ancient dd(1) [1],
   makes dd(1) more annoying to use.  If you want hex or octal input
   you can eval it with the shell:

        bs=$((0xFF))

   Perusing, e.g., github, this seems to be what most people
   do when they want hex in their scripts.

   Given that GNU dd doesn't try to parse hex input I'm pretty sure
   this will break nothing in ports, but I can audit if someone
   thinks it's worth it.

 - Somewhat less important, but using strtoull in a single routine
   as mentioned above uncouples the "count" and "files" operands from
   block sizes and offsets, which seems more appropriate.  They're
   counts, so they shouldn't be size_t, you know?

 - Disallow leading whitespace and leading plus ('+') characters.  We
   were already checking for leading minus ('-') characters with strchr(3),
   but just checking whether the first character is a digit is simpler.

   My thinking is that we should probably not allow something like the
   following:

        bs="    +64"

   which looks like it could be an error in someone's script.

   ... but this might be too strict.

Thoughts?

--
Scott Cheloha

[1] This seems to have happened when Keith Muller rewrote it.

    Ancient BSD dd(1) only took integers:

https://github.com/dspinellis/unix-history-repo/blob/ebe7a4befa43c108621eccfe74a9d3afdffce0e/usr/src/bin/dd/dd.c#L389

    But with the rewrite ca. 1991, operand parsing moved to args.c
    and since then, hex/octal has been OK:

https://github.com/dspinellis/unix-history-repo/blob/71251a9529e71ac9602579b8b5fc5f8e4979d3a/usr/src/bin/dd/args.c#L344

    I have no idea if this was accidental or not.

Index: bin/dd/args.c
===================================================================
RCS file: /cvs/src/bin/dd/args.c,v
retrieving revision 1.29
diff -u -p -r1.29 args.c
--- bin/dd/args.c       3 Jan 2018 19:12:20 -0000       1.29
+++ bin/dd/args.c       30 Mar 2018 02:23:57 -0000
@@ -39,8 +39,10 @@
 
 #include <err.h>
 #include <errno.h>
+#include <ctype.h>
 #include <limits.h>
 #include <stdio.h>
+#include <stdint.h>
 #include <stdlib.h>
 #include <string.h>
 
@@ -62,6 +64,9 @@ static void   f_skip(char *);
 static void    f_status(char *);
 static size_t  get_bsz(char *);
 static off_t   get_off(char *);
+static unsigned long long
+               get_expr(const char *, unsigned long long, unsigned long long);
+static long long get_multiplier(const char *);
 
 static const struct arg {
        const char *name;
@@ -94,6 +99,7 @@ jcl(char **argv)
        char *arg;
 
        in.dbsz = out.dbsz = 512;
+       files_cnt = 1;
 
        while ((oper = *++argv) != NULL) {
                if ((oper = strdup(oper)) == NULL)
@@ -138,8 +144,6 @@ jcl(char **argv)
        if (ddflags & (C_BLOCK|C_UNBLOCK)) {
                if (!(ddflags & C_CBS))
                        errx(1, "record operations require cbs");
-               if (cbsz == 0)
-                       errx(1, "cbs cannot be zero");
                cfunc = ddflags & C_BLOCK ? block : unblock;
        } else if (ddflags & C_CBS) {
                if (ddflags & (C_ASCII|C_EBCDIC)) {
@@ -152,23 +156,14 @@ jcl(char **argv)
                        }
                } else
                        errx(1, "cbs meaningless if not doing record 
operations");
-               if (cbsz == 0)
-                       errx(1, "cbs cannot be zero");
        } else
                cfunc = def;
 
-       if (in.dbsz == 0 || out.dbsz == 0)
-               errx(1, "buffer sizes cannot be zero");
-
-       /*
-        * Read and write take size_t's as arguments.  Lseek, however,
-        * takes an off_t.
-        */
-       if (cbsz > SSIZE_MAX || in.dbsz > SSIZE_MAX || out.dbsz > SSIZE_MAX)
-               errx(1, "buffer sizes cannot be greater than %zd",
-                   (ssize_t)SSIZE_MAX);
-       if (in.offset > LLONG_MAX / in.dbsz || out.offset > LLONG_MAX / 
out.dbsz)
-               errx(1, "seek offsets cannot be larger than %lld", LLONG_MAX);
+       /* Ensure the seek won't overflow. */
+       if (in.offset != 0 && in.offset > LLONG_MAX / in.dbsz)
+               errx(1, "total skip is too large");
+       if (out.offset != 0 && out.dbsz > LLONG_MAX / out.offset)
+               errx(1, "total seek is too large");
 }
 
 static int
@@ -196,15 +191,14 @@ static void
 f_count(char *arg)
 {
 
-       if ((cpy_cnt = get_bsz(arg)) == 0)
-               cpy_cnt = (size_t)-1;
+       cpy_cnt = get_expr(arg, 0, ULLONG_MAX);
 }
 
 static void
 f_files(char *arg)
 {
 
-       files_cnt = get_bsz(arg);
+       files_cnt = get_expr(arg, 0, ULLONG_MAX);
 }
 
 static void
@@ -307,165 +301,108 @@ f_conv(char *arg)
        }
 }
 
-/*
- * Convert an expression of the following forms to a size_t
- *     1) A positive decimal number, optionally followed by
- *             b - multiply by 512.
- *             k, m or g - multiply by 1024 each.
- *             w - multiply by sizeof int
- *     2) Two or more of the above, separated by x
- *        (or * for backwards compatibility), specifying
- *        the product of the indicated values.
- */
 static size_t
-get_bsz(char *val)
+get_bsz(char *arg)
 {
-       size_t num, t;
-       char *expr;
+       /* read(2) and write(2) can't handle more than SSIZE_MAX bytes. */
+       return get_expr(arg, 1, SSIZE_MAX);
+}
 
-       if (strchr(val, '-'))
-               errx(1, "%s: illegal numeric value", oper);
+static off_t
+get_off(char *arg)
+{
+       return get_expr(arg, 0, LLONG_MAX);
+}
 
-       errno = 0;
-       num = strtoul(val, &expr, 0);
-       if (num == ULONG_MAX && errno == ERANGE)        /* Overflow. */
-               err(1, "%s", oper);
-       if (expr == val)                        /* No digits. */
-               errx(1, "%s: illegal numeric value", oper);
+/*
+ * An expression is composed of one or more terms separated by exes ('x').
+ * Terms separated by asterisks ('*') are supported for backwards
+ * compatibility.  The value of an expression is the product of its terms.
+ *
+ * A term is composed of a non-negative decimal integer and an optional
+ * suffix of a single character.  Suffixes are valued as follows:
+ *
+ *     b   - 512               ("block")
+ *     G|g - 1073741824        ("gigabyte")
+ *     K|k - 1024              ("kilobyte")
+ *     M|m - 1048576           ("megabyte")
+ *     w   - sizeof(int)       ("word")
+ *
+ * The value of a term without a suffix is that of its integer.  The value
+ * of a term with a suffix is the product of its integer and its suffix.
+ */
+static unsigned long long
+get_expr(const char *arg, unsigned long long min, unsigned long long max)
+{
+       unsigned long long number, value;
+       long long multiplier;
+       char *expr, *term, *suffix;
 
-       switch(*expr) {
-       case 'b':
-               t = num;
-               num *= 512;
-               if (t > num)
-                       goto erange;
-               ++expr;
-               break;
-       case 'g':
-       case 'G':
-               t = num;
-               num *= 1024;
-               if (t > num)
-                       goto erange;
-               /* fallthrough */
-       case 'm':
-       case 'M':
-               t = num;
-               num *= 1024;
-               if (t > num)
-                       goto erange;
-               /* fallthrough */
-       case 'k':
-       case 'K':
-               t = num;
-               num *= 1024;
-               if (t > num)
-                       goto erange;
-               ++expr;
-               break;
-       case 'w':
-               t = num;
-               num *= sizeof(int);
-               if (t > num)
-                       goto erange;
-               ++expr;
-               break;
-       }
+       value = 1;
+
+       expr = strdup(arg);
+       if (expr == NULL)
+               err(1, NULL);
 
-       switch(*expr) {
-               case '\0':
-                       break;
-               case '*':                       /* Backward compatible. */
-               case 'x':
-                       t = num;
-                       num *= get_bsz(expr + 1);
-                       if (t > num)
-                               goto erange;
-                       break;
-               default:
-                       errx(1, "%s: illegal numeric value", oper);
+       while ((term = strsep(&expr, "*x")) != NULL) {
+               /*
+                * strtoull(3) ignores leading whitespace and quietly
+                * underflows terms prefixed with a '-', so we have to check.
+                */
+               if (!isdigit((unsigned char)term[0]))
+                       errx(1, "%s is invalid: %s", oper, arg);
+               errno = 0;
+               number = strtoull(term, &suffix, 10);
+               if (term[0] == '\0' || suffix == term)
+                       errx(1, "%s is invalid: %s", oper, arg);
+               if (number < min)
+                       errx(1, "%s is too small: %s", oper, arg);
+               if (number > max || (number == ULLONG_MAX && errno == ERANGE))
+                       errx(1, "%s is too large: %s", oper, arg);
+               multiplier = get_multiplier(suffix);
+               if (multiplier == -1)
+                       errx(1, "%s is invalid: %s", oper, arg);
+               if (number > max / multiplier ||
+                   (value > 0 && number * multiplier > max / value))
+                       errx(1, "%s is too large: %s", oper, arg);
+               value *= number * multiplier;
        }
-       return (num);
-erange:
-       errc(1, ERANGE, "%s", oper);
+       free(expr);
+       return (value);
 }
 
-/*
- * Convert an expression of the following forms to an off_t
- *     1) A positive decimal number, optionally followed by
- *             b - multiply by 512.
- *             k, m or g - multiply by 1024 each.
- *             w - multiply by sizeof int
- *     2) Two or more of the above, separated by x
- *        (or * for backwards compatibility), specifying
- *        the product of the indicated values.
- */
-static off_t
-get_off(char *val)
+static long long
+get_multiplier(const char *str)
 {
-       off_t num, t;
-       char *expr;
-
-       errno = 0;
-       num = strtoll(val, &expr, 0);
-       if (num == LLONG_MAX && errno == ERANGE)        /* Overflow. */
-               err(1, "%s", oper);
-       if (expr == val)                        /* No digits. */
-               errx(1, "%s: illegal numeric value", oper);
+       long long value;
+       int unknown = 0;
 
-       switch(*expr) {
+       switch(str[0]) {
+       case '\0':
+               value = 1;
+               break;
        case 'b':
-               t = num;
-               num *= 512;
-               if (t > num)
-                       goto erange;
-               ++expr;
+               value = 512;
                break;
-       case 'g':
        case 'G':
-               t = num;
-               num *= 1024;
-               if (t > num)
-                       goto erange;
-               /* fallthrough */
-       case 'm':
-       case 'M':
-               t = num;
-               num *= 1024;
-               if (t > num)
-                       goto erange;
-               /* fallthrough */
-       case 'k':
+       case 'g':
+               value = 1073741824LL;
+               break;
        case 'K':
-               t = num;
-               num *= 1024;
-               if (t > num)
-                       goto erange;
-               ++expr;
+       case 'k':
+               value = 1024;
+               break;
+       case 'M':
+       case 'm':
+               value = 1048576;
                break;
        case 'w':
-               t = num;
-               num *= sizeof(int);
-               if (t > num)
-                       goto erange;
-               ++expr;
+               value = sizeof(int);
                break;
+       default:
+               unknown = 1;
        }
-
-       switch(*expr) {
-               case '\0':
-                       break;
-               case '*':                       /* Backward compatible. */
-               case 'x':
-                       t = num;
-                       num *= get_off(expr + 1);
-                       if (t > num)
-                               goto erange;
-                       break;
-               default:
-                       errx(1, "%s: illegal numeric value", oper);
-       }
-       return (num);
-erange:
-       errc(1, ERANGE, "%s", oper);
+       if (unknown || (str[0] != '\0' && str[1] != '\0'))
+               value = -1;
+       return (value);
 }
Index: bin/dd/dd.c
===================================================================
RCS file: /cvs/src/bin/dd/dd.c,v
retrieving revision 1.24
diff -u -p -r1.24 dd.c
--- bin/dd/dd.c 13 Aug 2017 02:06:42 -0000      1.24
+++ bin/dd/dd.c 30 Mar 2018 02:23:57 -0000
@@ -62,10 +62,10 @@ static void setup(void);
 IO     in, out;                /* input/output state */
 STAT   st;                     /* statistics */
 void   (*cfunc)(void);         /* conversion function */
-size_t cpy_cnt;                /* # of blocks to copy */
+unsigned long long cpy_cnt;    /* # of blocks to copy */
 u_int  ddflags;                /* conversion options */
 size_t cbsz;                   /* conversion block size */
-size_t files_cnt = 1;          /* # of files to copy */
+unsigned long long files_cnt;  /* # of files to copy */
 const  u_char  *ctab;          /* conversion table */
 
 int
@@ -79,7 +79,7 @@ main(int argc, char *argv[])
 
        atexit(summary);
 
-       if (cpy_cnt != (size_t)-1) {
+       if (!(ddflags & C_COUNT) || cpy_cnt > 0) {
                while (files_cnt--)
                        dd_in();
        }
@@ -102,7 +102,7 @@ setup(void)
 
        getfdtype(&in);
 
-       if (files_cnt > 1 && !(in.flags & ISTAPE))
+       if ((ddflags & C_FILES) && !(in.flags & ISTAPE))
                errx(1, "files is not supported for non-tape devices");
 
        if (out.name == NULL) {
@@ -232,7 +232,7 @@ dd_in(void)
        ssize_t n;
 
        for (;;) {
-               if (cpy_cnt && (st.in_full + st.in_part) >= cpy_cnt)
+               if ((ddflags & C_COUNT) && st.in_full + st.in_part >= cpy_cnt)
                        return;
 
                /*
Index: bin/dd/extern.h
===================================================================
RCS file: /cvs/src/bin/dd/extern.h,v
retrieving revision 1.9
diff -u -p -r1.9 extern.h
--- bin/dd/extern.h     27 Mar 2014 15:32:13 -0000      1.9
+++ bin/dd/extern.h     30 Mar 2018 02:23:57 -0000
@@ -53,10 +53,10 @@ void unblock_close(void);
 extern IO in, out;
 extern STAT st;
 extern void (*cfunc)(void);
-extern size_t cpy_cnt;
+extern unsigned long long cpy_cnt;
 extern size_t cbsz;
 extern u_int ddflags;
-extern size_t files_cnt;
+extern unsigned long long files_cnt;
 extern const u_char *ctab;
 extern const u_char a2e_POSIX[];
 extern const u_char e2a_POSIX[];

Reply via email to