Re: dd(1): overhaul operand parsing

2018-03-31 Thread Scott Cheloha
Updated diff with changes from tobias@:

This patch now removes (u_int) casts from input/output buffer
allocation in dd.c, which was incorrect beforehand anyway, but
with the other changes here was manifesting as a segfault for
combinations of cbs and ibs/obs that overflowed u_int.

--
Scott Cheloha

Index: bin/dd/args.c
===
RCS file: /cvs/src/bin/dd/args.c,v
retrieving revision 1.29
diff -u -p -r1.29 args.c
--- bin/dd/args.c   3 Jan 2018 19:12:20 -   1.29
+++ bin/dd/args.c   31 Mar 2018 17:47:04 -
@@ -39,8 +39,10 @@
 
 #include 
 #include 
+#include 
 #include 
 #include 
+#include 
 #include 
 #include 
 
@@ -62,6 +64,9 @@ static void   f_skip(char *);
 static voidf_status(char *);
 static size_t  get_bsz(char *);
 static off_t   get_off(char *);
+static unsigned long long
+   get_expr(const char *, unsigned long long, unsigned long long);
+static long long get_multiplier(const char *);
 
 static const struct arg {
const char *name;
@@ -94,6 +99,7 @@ jcl(char **argv)
char *arg;
 
in.dbsz = out.dbsz = 512;
+   files_cnt = 1;
 
while ((oper = *++argv) != NULL) {
if ((oper = strdup(oper)) == NULL)
@@ -138,8 +144,6 @@ jcl(char **argv)
if (ddflags & (C_BLOCK|C_UNBLOCK)) {
if (!(ddflags & C_CBS))
errx(1, "record operations require cbs");
-   if (cbsz == 0)
-   errx(1, "cbs cannot be zero");
cfunc = ddflags & C_BLOCK ? block : unblock;
} else if (ddflags & C_CBS) {
if (ddflags & (C_ASCII|C_EBCDIC)) {
@@ -152,23 +156,14 @@ jcl(char **argv)
}
} else
errx(1, "cbs meaningless if not doing record 
operations");
-   if (cbsz == 0)
-   errx(1, "cbs cannot be zero");
} else
cfunc = def;
 
-   if (in.dbsz == 0 || out.dbsz == 0)
-   errx(1, "buffer sizes cannot be zero");
-
-   /*
-* Read and write take size_t's as arguments.  Lseek, however,
-* takes an off_t.
-*/
-   if (cbsz > SSIZE_MAX || in.dbsz > SSIZE_MAX || out.dbsz > SSIZE_MAX)
-   errx(1, "buffer sizes cannot be greater than %zd",
-   (ssize_t)SSIZE_MAX);
-   if (in.offset > LLONG_MAX / in.dbsz || out.offset > LLONG_MAX / 
out.dbsz)
-   errx(1, "seek offsets cannot be larger than %lld", LLONG_MAX);
+   /* Ensure the seek won't overflow. */
+   if (in.offset != 0 && in.dbsz > LLONG_MAX / in.offset)
+   errx(1, "total skip is too large");
+   if (out.offset != 0 && out.dbsz > LLONG_MAX / out.offset)
+   errx(1, "total seek is too large");
 }
 
 static int
@@ -196,15 +191,14 @@ static void
 f_count(char *arg)
 {
 
-   if ((cpy_cnt = get_bsz(arg)) == 0)
-   cpy_cnt = (size_t)-1;
+   cpy_cnt = get_expr(arg, 0, ULLONG_MAX);
 }
 
 static void
 f_files(char *arg)
 {
 
-   files_cnt = get_bsz(arg);
+   files_cnt = get_expr(arg, 0, ULLONG_MAX);
 }
 
 static void
@@ -307,165 +301,108 @@ f_conv(char *arg)
}
 }
 
-/*
- * Convert an expression of the following forms to a size_t
- * 1) A positive decimal number, optionally followed by
- * b - multiply by 512.
- * k, m or g - multiply by 1024 each.
- * w - multiply by sizeof int
- * 2) Two or more of the above, separated by x
- *(or * for backwards compatibility), specifying
- *the product of the indicated values.
- */
 static size_t
-get_bsz(char *val)
+get_bsz(char *arg)
 {
-   size_t num, t;
-   char *expr;
+   /* read(2) and write(2) can't handle more than SSIZE_MAX bytes. */
+   return get_expr(arg, 1, SSIZE_MAX);
+}
 
-   if (strchr(val, '-'))
-   errx(1, "%s: illegal numeric value", oper);
+static off_t
+get_off(char *arg)
+{
+   return get_expr(arg, 0, LLONG_MAX);
+}
 
-   errno = 0;
-   num = strtoul(val, , 0);
-   if (num == ULONG_MAX && errno == ERANGE)/* Overflow. */
-   err(1, "%s", oper);
-   if (expr == val)/* No digits. */
-   errx(1, "%s: illegal numeric value", oper);
+/*
+ * An expression is composed of one or more terms separated by exes ('x').
+ * Terms separated by asterisks ('*') are supported for backwards
+ * compatibility.  The value of an expression is the product of its terms.
+ *
+ * A term is composed of a non-negative decimal integer and an optional
+ * suffix of a single character.  Suffixes are valued as follows:
+ *
+ * b   - 512   ("block")
+ * G|g - 1073741824("gigabyte")
+ * K|k - 1024  ("kilobyte")
+ * M|m - 1048576   ("megabyte")
+ * w   - sizeof(int)   ("word")
+ *
+ * The value of a term without a suffix is that 

dd(1): overhaul operand parsing

2018-03-30 Thread Scott Cheloha
Hey,

This overhauls dd(1) operand parsing for expressions.

 - Eliminate duplicate code by using a single routine, get_expr().
   If we check for negative inputs we can use strtoull(3) safely,
   which covers the accepted input range for both off_t and size_t.

 - We can then use a single switch statement -- refactored here
   into a new routine, get_multiplier() -- for suffix evaluation.

 - This also lets us consolidate error handling.  We can provide
   better error messages if we catch invalid inputs in-place.  For
   instance, we no longer need to double-check whether cbs is zero,
   or if any of the other block sizes managed to wind up zero.

 - Parse iteratively with strsep(3).  The recursive parsing is clever,
   but it provides no advantages over iterative parsing.  It also allows
   for some pathological cases like the following, which overflows the
   stack on macppc:

count=$(perl -e 'print "1x"x10, "1"')

   Using strsep(3) is briefer and more familiar.

 - Catch all (I think?) multiplicative overflow.  Formerly, you could
   get away with something like:

2048x2147483649

   on a 32-bit architecture, or

2048x4611686018427387905

   on a 64-bit architecture.

   No more.  We check for overflow on the input, overflow
   when multiplying the input by the value of the suffix,
   and overflow when multiplying a new term by the product
   of the prior terms in the expression.

 - Standard dd(1) does not support hexidecimal or octal
   input.  BSD dd(1) does [1], but GNU dd(1) does not.

   Supporting both 'x' multiplication and hex input might
   be useful, but I think it's a bad idea from a parsing
   perspective.  I'm not sure if it makes the grammar ambiguous,
   but it does make it harder to reason about at a glance.
   What is the value of the following?

2x0x10K

   For us it's both a valid block size and a valid seek, but
   in GNU dd it's only valid as a seek or a count.

   That's a bit pathological, but I don't think enforcing decimal
   input, as prescribed by POSIX and adhered to by ancient dd(1) [1],
   makes dd(1) more annoying to use.  If you want hex or octal input
   you can eval it with the shell:

bs=$((0xFF))

   Perusing, e.g., github, this seems to be what most people
   do when they want hex in their scripts.

   Given that GNU dd doesn't try to parse hex input I'm pretty sure
   this will break nothing in ports, but I can audit if someone
   thinks it's worth it.

 - Somewhat less important, but using strtoull in a single routine
   as mentioned above uncouples the "count" and "files" operands from
   block sizes and offsets, which seems more appropriate.  They're
   counts, so they shouldn't be size_t, you know?

 - Disallow leading whitespace and leading plus ('+') characters.  We
   were already checking for leading minus ('-') characters with strchr(3),
   but just checking whether the first character is a digit is simpler.

   My thinking is that we should probably not allow something like the
   following:

bs="+64"

   which looks like it could be an error in someone's script.

   ... but this might be too strict.

Thoughts?

--
Scott Cheloha

[1] This seems to have happened when Keith Muller rewrote it.

Ancient BSD dd(1) only took integers:

https://github.com/dspinellis/unix-history-repo/blob/ebe7a4befa43c108621eccfe74a9d3afdffce0e/usr/src/bin/dd/dd.c#L389

But with the rewrite ca. 1991, operand parsing moved to args.c
and since then, hex/octal has been OK:

https://github.com/dspinellis/unix-history-repo/blob/71251a9529e71ac9602579b8b5fc5f8e4979d3a/usr/src/bin/dd/args.c#L344

I have no idea if this was accidental or not.

Index: bin/dd/args.c
===
RCS file: /cvs/src/bin/dd/args.c,v
retrieving revision 1.29
diff -u -p -r1.29 args.c
--- bin/dd/args.c   3 Jan 2018 19:12:20 -   1.29
+++ bin/dd/args.c   30 Mar 2018 02:23:57 -
@@ -39,8 +39,10 @@
 
 #include 
 #include 
+#include 
 #include 
 #include 
+#include 
 #include 
 #include 
 
@@ -62,6 +64,9 @@ static void   f_skip(char *);
 static voidf_status(char *);
 static size_t  get_bsz(char *);
 static off_t   get_off(char *);
+static unsigned long long
+   get_expr(const char *, unsigned long long, unsigned long long);
+static long long get_multiplier(const char *);
 
 static const struct arg {
const char *name;
@@ -94,6 +99,7 @@ jcl(char **argv)
char *arg;
 
in.dbsz = out.dbsz = 512;
+   files_cnt = 1;
 
while ((oper = *++argv) != NULL) {
if ((oper = strdup(oper)) == NULL)
@@ -138,8 +144,6 @@ jcl(char **argv)
if (ddflags & (C_BLOCK|C_UNBLOCK)) {
if (!(ddflags & C_CBS))
errx(1, "record operations require cbs");
-   if (cbsz == 0)
-   errx(1, "cbs cannot be zero");
cfunc = ddflags & C_BLOCK ? block :