Hey, This overhauls dd(1) operand parsing for expressions.
- Eliminate duplicate code by using a single routine, get_expr(). If we check for negative inputs we can use strtoull(3) safely, which covers the accepted input range for both off_t and size_t. - We can then use a single switch statement -- refactored here into a new routine, get_multiplier() -- for suffix evaluation. - This also lets us consolidate error handling. We can provide better error messages if we catch invalid inputs in-place. For instance, we no longer need to double-check whether cbs is zero, or if any of the other block sizes managed to wind up zero. - Parse iteratively with strsep(3). The recursive parsing is clever, but it provides no advantages over iterative parsing. It also allows for some pathological cases like the following, which overflows the stack on macppc: count=$(perl -e 'print "1x"x100000, "1"') Using strsep(3) is briefer and more familiar. - Catch all (I think?) multiplicative overflow. Formerly, you could get away with something like: 2048x2147483649 on a 32-bit architecture, or 2048x4611686018427387905 on a 64-bit architecture. No more. We check for overflow on the input, overflow when multiplying the input by the value of the suffix, and overflow when multiplying a new term by the product of the prior terms in the expression. - Standard dd(1) does not support hexidecimal or octal input. BSD dd(1) does [1], but GNU dd(1) does not. Supporting both 'x' multiplication and hex input might be useful, but I think it's a bad idea from a parsing perspective. I'm not sure if it makes the grammar ambiguous, but it does make it harder to reason about at a glance. What is the value of the following? 2x0x10K For us it's both a valid block size and a valid seek, but in GNU dd it's only valid as a seek or a count. That's a bit pathological, but I don't think enforcing decimal input, as prescribed by POSIX and adhered to by ancient dd(1) [1], makes dd(1) more annoying to use. If you want hex or octal input you can eval it with the shell: bs=$((0xFF)) Perusing, e.g., github, this seems to be what most people do when they want hex in their scripts. Given that GNU dd doesn't try to parse hex input I'm pretty sure this will break nothing in ports, but I can audit if someone thinks it's worth it. - Somewhat less important, but using strtoull in a single routine as mentioned above uncouples the "count" and "files" operands from block sizes and offsets, which seems more appropriate. They're counts, so they shouldn't be size_t, you know? - Disallow leading whitespace and leading plus ('+') characters. We were already checking for leading minus ('-') characters with strchr(3), but just checking whether the first character is a digit is simpler. My thinking is that we should probably not allow something like the following: bs=" +64" which looks like it could be an error in someone's script. ... but this might be too strict. Thoughts? -- Scott Cheloha [1] This seems to have happened when Keith Muller rewrote it. Ancient BSD dd(1) only took integers: https://github.com/dspinellis/unix-history-repo/blob/ebe7a4befa43c108621eccfe74a9d3afdffce0e/usr/src/bin/dd/dd.c#L389 But with the rewrite ca. 1991, operand parsing moved to args.c and since then, hex/octal has been OK: https://github.com/dspinellis/unix-history-repo/blob/71251a9529e71ac9602579b8b5fc5f8e4979d3a/usr/src/bin/dd/args.c#L344 I have no idea if this was accidental or not. Index: bin/dd/args.c =================================================================== RCS file: /cvs/src/bin/dd/args.c,v retrieving revision 1.29 diff -u -p -r1.29 args.c --- bin/dd/args.c 3 Jan 2018 19:12:20 -0000 1.29 +++ bin/dd/args.c 30 Mar 2018 02:23:57 -0000 @@ -39,8 +39,10 @@ #include <err.h> #include <errno.h> +#include <ctype.h> #include <limits.h> #include <stdio.h> +#include <stdint.h> #include <stdlib.h> #include <string.h> @@ -62,6 +64,9 @@ static void f_skip(char *); static void f_status(char *); static size_t get_bsz(char *); static off_t get_off(char *); +static unsigned long long + get_expr(const char *, unsigned long long, unsigned long long); +static long long get_multiplier(const char *); static const struct arg { const char *name; @@ -94,6 +99,7 @@ jcl(char **argv) char *arg; in.dbsz = out.dbsz = 512; + files_cnt = 1; while ((oper = *++argv) != NULL) { if ((oper = strdup(oper)) == NULL) @@ -138,8 +144,6 @@ jcl(char **argv) if (ddflags & (C_BLOCK|C_UNBLOCK)) { if (!(ddflags & C_CBS)) errx(1, "record operations require cbs"); - if (cbsz == 0) - errx(1, "cbs cannot be zero"); cfunc = ddflags & C_BLOCK ? block : unblock; } else if (ddflags & C_CBS) { if (ddflags & (C_ASCII|C_EBCDIC)) { @@ -152,23 +156,14 @@ jcl(char **argv) } } else errx(1, "cbs meaningless if not doing record operations"); - if (cbsz == 0) - errx(1, "cbs cannot be zero"); } else cfunc = def; - if (in.dbsz == 0 || out.dbsz == 0) - errx(1, "buffer sizes cannot be zero"); - - /* - * Read and write take size_t's as arguments. Lseek, however, - * takes an off_t. - */ - if (cbsz > SSIZE_MAX || in.dbsz > SSIZE_MAX || out.dbsz > SSIZE_MAX) - errx(1, "buffer sizes cannot be greater than %zd", - (ssize_t)SSIZE_MAX); - if (in.offset > LLONG_MAX / in.dbsz || out.offset > LLONG_MAX / out.dbsz) - errx(1, "seek offsets cannot be larger than %lld", LLONG_MAX); + /* Ensure the seek won't overflow. */ + if (in.offset != 0 && in.offset > LLONG_MAX / in.dbsz) + errx(1, "total skip is too large"); + if (out.offset != 0 && out.dbsz > LLONG_MAX / out.offset) + errx(1, "total seek is too large"); } static int @@ -196,15 +191,14 @@ static void f_count(char *arg) { - if ((cpy_cnt = get_bsz(arg)) == 0) - cpy_cnt = (size_t)-1; + cpy_cnt = get_expr(arg, 0, ULLONG_MAX); } static void f_files(char *arg) { - files_cnt = get_bsz(arg); + files_cnt = get_expr(arg, 0, ULLONG_MAX); } static void @@ -307,165 +301,108 @@ f_conv(char *arg) } } -/* - * Convert an expression of the following forms to a size_t - * 1) A positive decimal number, optionally followed by - * b - multiply by 512. - * k, m or g - multiply by 1024 each. - * w - multiply by sizeof int - * 2) Two or more of the above, separated by x - * (or * for backwards compatibility), specifying - * the product of the indicated values. - */ static size_t -get_bsz(char *val) +get_bsz(char *arg) { - size_t num, t; - char *expr; + /* read(2) and write(2) can't handle more than SSIZE_MAX bytes. */ + return get_expr(arg, 1, SSIZE_MAX); +} - if (strchr(val, '-')) - errx(1, "%s: illegal numeric value", oper); +static off_t +get_off(char *arg) +{ + return get_expr(arg, 0, LLONG_MAX); +} - errno = 0; - num = strtoul(val, &expr, 0); - if (num == ULONG_MAX && errno == ERANGE) /* Overflow. */ - err(1, "%s", oper); - if (expr == val) /* No digits. */ - errx(1, "%s: illegal numeric value", oper); +/* + * An expression is composed of one or more terms separated by exes ('x'). + * Terms separated by asterisks ('*') are supported for backwards + * compatibility. The value of an expression is the product of its terms. + * + * A term is composed of a non-negative decimal integer and an optional + * suffix of a single character. Suffixes are valued as follows: + * + * b - 512 ("block") + * G|g - 1073741824 ("gigabyte") + * K|k - 1024 ("kilobyte") + * M|m - 1048576 ("megabyte") + * w - sizeof(int) ("word") + * + * The value of a term without a suffix is that of its integer. The value + * of a term with a suffix is the product of its integer and its suffix. + */ +static unsigned long long +get_expr(const char *arg, unsigned long long min, unsigned long long max) +{ + unsigned long long number, value; + long long multiplier; + char *expr, *term, *suffix; - switch(*expr) { - case 'b': - t = num; - num *= 512; - if (t > num) - goto erange; - ++expr; - break; - case 'g': - case 'G': - t = num; - num *= 1024; - if (t > num) - goto erange; - /* fallthrough */ - case 'm': - case 'M': - t = num; - num *= 1024; - if (t > num) - goto erange; - /* fallthrough */ - case 'k': - case 'K': - t = num; - num *= 1024; - if (t > num) - goto erange; - ++expr; - break; - case 'w': - t = num; - num *= sizeof(int); - if (t > num) - goto erange; - ++expr; - break; - } + value = 1; + + expr = strdup(arg); + if (expr == NULL) + err(1, NULL); - switch(*expr) { - case '\0': - break; - case '*': /* Backward compatible. */ - case 'x': - t = num; - num *= get_bsz(expr + 1); - if (t > num) - goto erange; - break; - default: - errx(1, "%s: illegal numeric value", oper); + while ((term = strsep(&expr, "*x")) != NULL) { + /* + * strtoull(3) ignores leading whitespace and quietly + * underflows terms prefixed with a '-', so we have to check. + */ + if (!isdigit((unsigned char)term[0])) + errx(1, "%s is invalid: %s", oper, arg); + errno = 0; + number = strtoull(term, &suffix, 10); + if (term[0] == '\0' || suffix == term) + errx(1, "%s is invalid: %s", oper, arg); + if (number < min) + errx(1, "%s is too small: %s", oper, arg); + if (number > max || (number == ULLONG_MAX && errno == ERANGE)) + errx(1, "%s is too large: %s", oper, arg); + multiplier = get_multiplier(suffix); + if (multiplier == -1) + errx(1, "%s is invalid: %s", oper, arg); + if (number > max / multiplier || + (value > 0 && number * multiplier > max / value)) + errx(1, "%s is too large: %s", oper, arg); + value *= number * multiplier; } - return (num); -erange: - errc(1, ERANGE, "%s", oper); + free(expr); + return (value); } -/* - * Convert an expression of the following forms to an off_t - * 1) A positive decimal number, optionally followed by - * b - multiply by 512. - * k, m or g - multiply by 1024 each. - * w - multiply by sizeof int - * 2) Two or more of the above, separated by x - * (or * for backwards compatibility), specifying - * the product of the indicated values. - */ -static off_t -get_off(char *val) +static long long +get_multiplier(const char *str) { - off_t num, t; - char *expr; - - errno = 0; - num = strtoll(val, &expr, 0); - if (num == LLONG_MAX && errno == ERANGE) /* Overflow. */ - err(1, "%s", oper); - if (expr == val) /* No digits. */ - errx(1, "%s: illegal numeric value", oper); + long long value; + int unknown = 0; - switch(*expr) { + switch(str[0]) { + case '\0': + value = 1; + break; case 'b': - t = num; - num *= 512; - if (t > num) - goto erange; - ++expr; + value = 512; break; - case 'g': case 'G': - t = num; - num *= 1024; - if (t > num) - goto erange; - /* fallthrough */ - case 'm': - case 'M': - t = num; - num *= 1024; - if (t > num) - goto erange; - /* fallthrough */ - case 'k': + case 'g': + value = 1073741824LL; + break; case 'K': - t = num; - num *= 1024; - if (t > num) - goto erange; - ++expr; + case 'k': + value = 1024; + break; + case 'M': + case 'm': + value = 1048576; break; case 'w': - t = num; - num *= sizeof(int); - if (t > num) - goto erange; - ++expr; + value = sizeof(int); break; + default: + unknown = 1; } - - switch(*expr) { - case '\0': - break; - case '*': /* Backward compatible. */ - case 'x': - t = num; - num *= get_off(expr + 1); - if (t > num) - goto erange; - break; - default: - errx(1, "%s: illegal numeric value", oper); - } - return (num); -erange: - errc(1, ERANGE, "%s", oper); + if (unknown || (str[0] != '\0' && str[1] != '\0')) + value = -1; + return (value); } Index: bin/dd/dd.c =================================================================== RCS file: /cvs/src/bin/dd/dd.c,v retrieving revision 1.24 diff -u -p -r1.24 dd.c --- bin/dd/dd.c 13 Aug 2017 02:06:42 -0000 1.24 +++ bin/dd/dd.c 30 Mar 2018 02:23:57 -0000 @@ -62,10 +62,10 @@ static void setup(void); IO in, out; /* input/output state */ STAT st; /* statistics */ void (*cfunc)(void); /* conversion function */ -size_t cpy_cnt; /* # of blocks to copy */ +unsigned long long cpy_cnt; /* # of blocks to copy */ u_int ddflags; /* conversion options */ size_t cbsz; /* conversion block size */ -size_t files_cnt = 1; /* # of files to copy */ +unsigned long long files_cnt; /* # of files to copy */ const u_char *ctab; /* conversion table */ int @@ -79,7 +79,7 @@ main(int argc, char *argv[]) atexit(summary); - if (cpy_cnt != (size_t)-1) { + if (!(ddflags & C_COUNT) || cpy_cnt > 0) { while (files_cnt--) dd_in(); } @@ -102,7 +102,7 @@ setup(void) getfdtype(&in); - if (files_cnt > 1 && !(in.flags & ISTAPE)) + if ((ddflags & C_FILES) && !(in.flags & ISTAPE)) errx(1, "files is not supported for non-tape devices"); if (out.name == NULL) { @@ -232,7 +232,7 @@ dd_in(void) ssize_t n; for (;;) { - if (cpy_cnt && (st.in_full + st.in_part) >= cpy_cnt) + if ((ddflags & C_COUNT) && st.in_full + st.in_part >= cpy_cnt) return; /* Index: bin/dd/extern.h =================================================================== RCS file: /cvs/src/bin/dd/extern.h,v retrieving revision 1.9 diff -u -p -r1.9 extern.h --- bin/dd/extern.h 27 Mar 2014 15:32:13 -0000 1.9 +++ bin/dd/extern.h 30 Mar 2018 02:23:57 -0000 @@ -53,10 +53,10 @@ void unblock_close(void); extern IO in, out; extern STAT st; extern void (*cfunc)(void); -extern size_t cpy_cnt; +extern unsigned long long cpy_cnt; extern size_t cbsz; extern u_int ddflags; -extern size_t files_cnt; +extern unsigned long long files_cnt; extern const u_char *ctab; extern const u_char a2e_POSIX[]; extern const u_char e2a_POSIX[];