Module Name: src Committed By: kre Date: Mon Aug 21 13:20:49 UTC 2017
Modified Files: src/bin/sh: expand.c parser.c parser.h sh.1 syntax.c syntax.h Log Message: Add support for $'...' quoting (based upon C "..." strings, with \ expansions.) Implementation largely obtained from FreeBSD, with adaptations to meet the needs and style of this sh, some updates to agree with the current POSIX spec, and a few other minor changes. The POSIX spec for this ( http://austingroupbugs.net/view.php?id=249 ) [see note 2809 for the current proposed text] is yet to be approved, so might change. It currently leaves several aspects as unspecified, this implementation handles those as: Where more than 2 hex digits follow \x this implementation processes the first two as hex, the following characters are processed as if the \x sequence was not present. The value obtained from a \nnn octal sequence is truncated to the low 8 bits (if a bigger value is written, eg: \456.) Invalid escape sequences are errors. Invalid \u (or \U) code points are errors if known to be invalid, otherwise can generate a '?' character. Where any escape sequence generates nul ('\0') that char, and the rest of the $'...' string is discarded, but anything remaining in the word is processed, ie: aaa$'bbb\0ccc'ddd produces the same as aaa'bbb'ddd. Differences from FreeBSD: FreeBSD allows only exactly 4 or 8 hex digits for \u and \U (as does C, but the current sh proposal differs.) FreeBSD also continues consuming as many hex digits as exist after \x (permitted by the spec, but insane), and rejects \u0000 as invalid. Some of this is possibly because their implementation is based upon an earlier proposal, perhaps note 590 - though that has been updated several times. Differences from the current POSIX proposal: We currently always generate UTF-8 for the \u & \U escapes. We should generate the equivalent character from the current locale's character set (and UTF-8 only if that is what the current locale uses.) If anyone would like to correct that, go ahead. 
We (and FreeBSD) generate (X & 0x1F) for \cX escapes where we should generate the appropriate control character (SOH for \cA for example) with whatever value that has in the current character set. Apart from EBCDIC, which we do not support, I've never seen a case where they differ, so ... To generate a diff of this commit: cvs rdiff -u -r1.119 -r1.120 src/bin/sh/expand.c cvs rdiff -u -r1.143 -r1.144 src/bin/sh/parser.c cvs rdiff -u -r1.23 -r1.24 src/bin/sh/parser.h cvs rdiff -u -r1.163 -r1.164 src/bin/sh/sh.1 cvs rdiff -u -r1.4 -r1.5 src/bin/sh/syntax.c cvs rdiff -u -r1.8 -r1.9 src/bin/sh/syntax.h Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files.
Modified files: Index: src/bin/sh/expand.c diff -u src/bin/sh/expand.c:1.119 src/bin/sh/expand.c:1.120 --- src/bin/sh/expand.c:1.119 Fri Jun 30 23:02:56 2017 +++ src/bin/sh/expand.c Mon Aug 21 13:20:49 2017 @@ -1,4 +1,4 @@ -/* $NetBSD: expand.c,v 1.119 2017/06/30 23:02:56 kre Exp $ */ +/* $NetBSD: expand.c,v 1.120 2017/08/21 13:20:49 kre Exp $ */ /*- * Copyright (c) 1991, 1993 @@ -37,7 +37,7 @@ #if 0 static char sccsid[] = "@(#)expand.c 8.5 (Berkeley) 5/15/95"; #else -__RCSID("$NetBSD: expand.c,v 1.119 2017/06/30 23:02:56 kre Exp $"); +__RCSID("$NetBSD: expand.c,v 1.120 2017/08/21 13:20:49 kre Exp $"); #endif #endif /* not lint */ @@ -267,6 +267,9 @@ argstr(const char *p, int flag) STPUTC(c, expdest); line_number++; break; + case CTLCNL: + STPUTC('\n', expdest); /* no line_number++ */ + break; case CTLQUOTEEND: ifs_split = EXP_IFS_SPLIT; break; @@ -1842,6 +1845,11 @@ rmescapes(char *str) p++; continue; } + if (*p == CTLCNL) { + p++; + *q++ = '\n'; + continue; + } if (*p == CTLESC) p++; *q++ = *p++; @@ -1883,6 +1891,11 @@ rmescapes_nl(char *str) nls++; continue; } + if (*p == CTLCNL) { + p++; + *q++ = '\n'; + continue; + } if (*p == CTLESC) p++; Index: src/bin/sh/parser.c diff -u src/bin/sh/parser.c:1.143 src/bin/sh/parser.c:1.144 --- src/bin/sh/parser.c:1.143 Sat Aug 5 11:33:05 2017 +++ src/bin/sh/parser.c Mon Aug 21 13:20:49 2017 @@ -1,4 +1,4 @@ -/* $NetBSD: parser.c,v 1.143 2017/08/05 11:33:05 kre Exp $ */ +/* $NetBSD: parser.c,v 1.144 2017/08/21 13:20:49 kre Exp $ */ /*- * Copyright (c) 1991, 1993 @@ -37,7 +37,7 @@ #if 0 static char sccsid[] = "@(#)parser.c 8.7 (Berkeley) 5/16/95"; #else -__RCSID("$NetBSD: parser.c,v 1.143 2017/08/05 11:33:05 kre Exp $"); +__RCSID("$NetBSD: parser.c,v 1.144 2017/08/21 13:20:49 kre Exp $"); #endif #endif /* not lint */ @@ -1212,6 +1212,7 @@ struct tokenstate { #define NQ 0x00 /* Unquoted */ #define SQ 0x01 /* Single Quotes */ #define DQ 0x02 /* Double Quotes (or equivalent) */ +#define CQ 0x03 /* C style Single Quotes */ #define 
QF 0x0F /* Mask to extract previous values */ #define QS 0x10 /* Quoting started at this level in stack */ @@ -1562,6 +1563,165 @@ parseredir(const char *out, int c) redirnode = np; /* this is the "value" of TRENODE */ } +/* + * Called to parse a backslash escape sequence inside $'...'. + * The backslash has already been read. + */ +static char * +readcstyleesc(char *out) +{ + int c, vc, i, n; + unsigned int v; + + c = pgetc(); + switch (c) { + case '\0': + case PEOF: + synerror("Unterminated quoted string"); + case '\n': + plinno++; + if (doprompt) + setprompt(2); + else + setprompt(0); + return out; + + case '\\': + case '\'': + case '"': + v = c; + break; + + case 'a': v = '\a'; break; + case 'b': v = '\b'; break; + case 'e': v = '\033'; break; + case 'f': v = '\f'; break; + case 'n': v = '\n'; break; + case 'r': v = '\r'; break; + case 't': v = '\t'; break; + case 'v': v = '\v'; break; + + case '0': case '1': case '2': case '3': + case '4': case '5': case '6': case '7': + v = c - '0'; + c = pgetc(); + if (c >= '0' && c <= '7') { + v <<= 3; + v += c - '0'; + c = pgetc(); + if (c >= '0' && c <= '7') { + v <<= 3; + v += c - '0'; + } else + pungetc(); + } else + pungetc(); + break; + + case 'c': + c = pgetc(); + if (c < 0x3f || c > 0x7a || c == 0x60) + synerror("Bad \\c escape sequence"); + if (c == '\\' && pgetc() != '\\') + synerror("Bad \\c\\ escape sequence"); + if (c == '?') + v = 127; + else + v = c & 0x1f; + break; + + case 'x': + n = 2; + goto hexval; + case 'u': + n = 4; + goto hexval; + case 'U': + n = 8; + hexval: + v = 0; + for (i = 0; i < n; i++) { + c = pgetc(); + if (c >= '0' && c <= '9') + v = (v << 4) + c - '0'; + else if (c >= 'A' && c <= 'F') + v = (v << 4) + c - 'A' + 10; + else if (c >= 'a' && c <= 'f') + v = (v << 4) + c - 'a' + 10; + else { + pungetc(); + break; + } + } + if (n > 2 && v > 127) { + if (v >= 0xd800 && v <= 0xdfff) + synerror("Invalid \\u escape sequence"); + + /* XXX should we use iconv here. What locale? 
*/ + CHECKSTRSPACE(4, out); + + if (v <= 0x7ff) { + USTPUTC(0xc0 | v >> 6, out); + USTPUTC(0x80 | (v & 0x3f), out); + return out; + } else if (v <= 0xffff) { + USTPUTC(0xe0 | v >> 12, out); + USTPUTC(0x80 | ((v >> 6) & 0x3f), out); + USTPUTC(0x80 | (v & 0x3f), out); + return out; + } else if (v <= 0x10ffff) { + USTPUTC(0xf0 | v >> 18, out); + USTPUTC(0x80 | ((v >> 12) & 0x3f), out); + USTPUTC(0x80 | ((v >> 6) & 0x3f), out); + USTPUTC(0x80 | (v & 0x3f), out); + return out; + } + if (v > 127) + v = '?'; + } + break; + default: + synerror("Unknown $'' escape sequence"); + } + vc = (char)v; + + /* + * If we managed to create a \n from a \ sequence (no matter how) + * then we replace it with the magic CRTCNL control char, which + * will turn into a \n again later, but in the meantime, never + * causes LINENO increments. + */ + if (vc == '\n') { + USTPUTC(CTLCNL, out); + return out; + } + + /* + * We can't handle NUL bytes. + * POSIX says we should skip till the closing quote. + */ + if (vc == '\0') { + while ((c = pgetc()) != '\'') { + if (c == '\\') + c = pgetc(); + if (c == PEOF) + synerror("Unterminated quoted string"); + if (c == '\n') { + plinno++; + if (doprompt) + setprompt(2); + else + setprompt(0); + } + } + pungetc(); + return out; + } + if (SQSYNTAX[vc] == CCTL) + USTPUTC(CTLESC, out); + USTPUTC(vc, out); + return out; +} /* * The lowest level basic tokenizer. 
@@ -1623,9 +1783,16 @@ readtoken1(int firstc, char const *syn, setprompt(0); continue; + case CSBACK: /* single quoted backslash */ + if ((quoted & QF) == CQ) { + out = readcstyleesc(out); + continue; + } + /* FALLTHROUGH */ case CWORD: USTPUTC(c, out); continue; + case CCTL: if (!magicq || ISDBLQUOTE()) USTPUTC(CTLESC, out); @@ -1826,10 +1993,7 @@ parsesub: { static const char types[] = "}-+?="; c = pgetc_linecont(); - if (c != '('/*)*/ && c != OPENBRACE && !is_name(c) && !is_special(c)) { - USTPUTC('$', out); - pungetc(); - } else if (c == '('/*)*/) { /* $(command) or $((arith)) */ + if (c == '(' /*)*/) { /* $(command) or $((arith)) */ if (pgetc_linecont() == '(' /*')'*/ ) { out = insert_elided_nl(out); PARSEARITH(); @@ -1838,7 +2002,7 @@ parsesub: { pungetc(); out = parsebackq(stack, out, &bqlist, 0, magicq); } - } else { + } else if (c == OPENBRACE || is_name(c) || is_special(c)) { USTPUTC(CTLVAR, out); typeloc = out - stackblock(); USTPUTC(VSNORMAL, out); @@ -1974,6 +2138,15 @@ parsesub: { CLRDBLQUOTE(); } } + } else if (c == '\'' && syntax == BASESYNTAX) { + USTPUTC(CTLQUOTEMARK, out); + quotef = 1; + TS_PUSH(); + syntax = SQSYNTAX; + quoted = CQ; + } else { + USTPUTC('$', out); + pungetc(); } goto parsesub_return; } Index: src/bin/sh/parser.h diff -u src/bin/sh/parser.h:1.23 src/bin/sh/parser.h:1.24 --- src/bin/sh/parser.h:1.23 Fri Jun 30 23:02:56 2017 +++ src/bin/sh/parser.h Mon Aug 21 13:20:49 2017 @@ -1,4 +1,4 @@ -/* $NetBSD: parser.h,v 1.23 2017/06/30 23:02:56 kre Exp $ */ +/* $NetBSD: parser.h,v 1.24 2017/08/21 13:20:49 kre Exp $ */ /*- * Copyright (c) 1991, 1993 @@ -46,9 +46,10 @@ #define CTLENDARI '\207' #define CTLQUOTEMARK '\210' #define CTLQUOTEEND '\211' /* only inside ${...} */ -#define CTLNONL '\212' /* The \n in a deleted \ \n sequence */ +#define CTLNONL '\212' /* The \n in a deleted \ \n sequence */ /* pure concidence that (CTLNONL & 0x7f) == '\n' */ -#define CTL_LAST '\212' /* last 'special' character */ +#define CTLCNL '\213' /* A $'\n' - 
newline not counted */ +#define CTL_LAST '\213' /* last 'special' character */ /* variable substitution byte (follows CTLVAR) */ #define VSTYPE 0x0f /* type of variable substitution */ Index: src/bin/sh/sh.1 diff -u src/bin/sh/sh.1:1.163 src/bin/sh/sh.1:1.164 --- src/bin/sh/sh.1:1.163 Tue Jul 25 08:37:48 2017 +++ src/bin/sh/sh.1 Mon Aug 21 13:20:49 2017 @@ -1,4 +1,4 @@ -.\" $NetBSD: sh.1,v 1.163 2017/07/25 08:37:48 wiz Exp $ +.\" $NetBSD: sh.1,v 1.164 2017/08/21 13:20:49 kre Exp $ .\" Copyright (c) 1991, 1993 .\" The Regents of the University of California. All rights reserved. .\" @@ -31,7 +31,7 @@ .\" .\" @(#)sh.1 8.6 (Berkeley) 5/4/95 .\" -.Dd July 15, 2017 +.Dd August 20, 2017 .Dt SH 1 .\" everything except c o and s (keep them ordered) .ds flags abCEeFfhIiLmnpquVvx @@ -528,8 +528,12 @@ The following is a list of operators: .Ss Quoting Quoting is used to remove the special meaning of certain characters or words to the shell, such as operators, whitespace, or keywords. -There are three types of quoting: matched single quotes, -matched double quotes, and backslash. +There are four types of quoting: +matched single quotes, +matched double quotes, +backslash, +and +dollar preceding matched single quotes (enhanced C style strings.) .Ss Backslash An unquoted backslash preserves the literal meaning of the following character, with the exception of @@ -554,6 +558,192 @@ quote only the following characters (and .Dl $ ` \*q \e <newline> , where a backslash newline is a line continuation as above. Otherwise it remains literal. +.Ss Dollar Single Quotes (\&$'...') +.Bd -filled -offset indent +.Bf Em +Note: this form of quoting is still somewhat experimental, +and yet to be included in the +.Tn POSIX +standard. +This implementation is based upon the current proposals for +standardization, and is subject to change should the eventual +adopted text differ. 
+.Ef +.Ed +.Pp +Enclosing characters in a matched pair of single quotes, with the +first immediately preceded by an unquoted dollar sign +.Pq \&$ +provides a quoting mechanism similar to single quotes, except +that within the sequence of characters, any backslash +.Pq \e , +is an escape character, which causes the following character to +be treated specially. +Only a subset of the characters that can occur in the string +are defined after a backslash, others are reserved for future +definition, and currently generate a syntax error if used. +The escape sequences are modeled after the similar sequences +in strings in the C programming language, with some extensions. +.Pp +The following characters are treated literally when following +the escape character (backslash): +.Dl \e \&' \&" +The sequence +.Dq \e\e +allows the escape character (backslash) to appear in the string literally. +.Dq \e' +allows a single quote character into the string, such an +escaped single quote does not terminate the quoted string. +.Dq \e" +is for compatibility with C strings, the double quote has +no special meaning in a shell C-style string, +and does not need to be escaped, but may be. +.Pp +A newline following the escape character is treated as a line continuation, +like the same sequence in a double quoted string, +or when not quoted \(en +the two characters, escape and newline, are removed from the input string. +.Pp +The following characters, when escaped, are converted in a +manner similar to the way they would be in a string in the C language: +.Dl a b e f n r t v +An escaped +.Sq a +generates an alert (or +.Sq Tn BEL ) +character, that is, control-G, or 0x07. +In a similar way, +.Sq b +is backspace (0x08), +.Sq e +(an extension to C) is escape (0x1B), +.Sq f +is form feed (0x0C), +.Sq n +is newline (or line feed, 0x0A), +.Sq r +is return (0x0D), +.Sq t +is horizontal tab (0x09), +and +.Sq v +is vertical tab (0x13). 
+.Pp +In addition to those there are 5 forms that need additional +data, which is obtained from the subsequent characters. +An escape +.Pq \e +followed by one, two or three, octal digits +.Po So 0 Sc Ns \&.. Ns So 7 Sc Ns Pc +is processed to form an 8 bit character value. +If only one or two digits are present, the following +character must be something other than an octal digit. +It is safest to always use all 3 digits, with leading +zeroes if needed. +If all three digits are present, the first must be one of +.So 0 Sc Ns \&.. Ns So 3 Sc . +.Pp +An escape followed by +.Sq x +(lower case only) can be followed by one or two +hexadecimal digits +.Po So 0 Sc Ns \&.. Ns So 9 Sc Ns , So A Sc Ns \&.. Ns So F Sc Ns , or So a Sc Ns \&.. Ns So f Sc Ns . Pc +As with octal, if only one hex digit is present, the following +character must be something other than a hex digit, +so always giving 2 hex digits is best. +However, unlike octal, it is unspecified in the standard +how many hex digits can be consumed. +This +.Nm +takes at most two, but other shells will continue consuming +characters as long as they remain valid hex digits. +Consequently, users should ensure that the character +following the hex escape sequence is something other than +a hex digit. +One way to achieve this is to end the $'...' string immediately +after the final hex digit, and then, immediately start +another, so +.Dl \&$'\ex33'$'4...' +always gives the character with value 0x33 +.Pq Sq 3 , +followed by the character +.Sq 4 , +whereas +.Dl \&$'\ex334' +in some other shells would be the hex value 0x334 (10, or more, bits). +.Pp +There are two escape sequences beginning with +.Sq \eu +or +.Sq \eU . +The former is followed by from 1 to 4 hex digits, the latter by +from 1 to 8 hex digits. 
+Leading zeroes can be used to pad the sequences to the maximum +permitted length, to avoid any possible ambiguity problem with +the following character, and because there are some shells that +insist on exactly 4 (or 8) hex digits. +These sequences are evaluated to form the value of a Unicode code +point, which is then encoded into UTF-8 form, and entered into the +string. +(The code point should be converted to the appropriate +code point value for the corresponding character in the character +set given by the current locale, or perhaps the locale in use +when the shell was started, but is not... currently.) +Not all values that are possible to write are valid, values that +specify (known) invalid Unicode code points will be rejected, or +simply produce +.Sq \&? . +.Pp +Lastly, as another addition to what is available in C, the escape +character (backslash), followed by +.Sq c +(lower case only) followed by one additional character, which must +be an alphabetic character (a letter), or one of the following: +.Dl \&@ \&[ \&\e \&] \&^ \&_ \&? +Other than +.Sq \ec? +the value obtained is the least significant 5 bits of the +.Tn ASCII +value of the character following the +.Sq \ec +escape sequence. +That is what is commonly known as the +.Dq control +character obtained from the given character. +The escape sequence +.Sq \ec? +yields the +.Tn ASCII +.Tn DEL +character (0x7F). +Note that to obtain the +.Tn ASCII +.Tn FS +character (0x1C) this way, +.Pq "that is control-\e" +the trailing +.Sq \e +must be escaped itself, and so for this one case, the full +escape sequence is +.Dq \ec\e\e . +The sequence +.Dq \ec\eX +where +.Sq X +is some character other than +.Sq \e +is reserved for future use, its meaning is unspecified. +In this +.Nm +an error is generated. +.Pp +If any of the preceding escape sequences generate the value +.Sq \e0 +(a nul character) that character, and all that follow in the +same $'...' string, are omitted from the resulting word. 
+.Pp +After the $'...' string has had any included escape sequences +converted, it is treated as if it had been a single quoted string. .Ss Reserved Words Reserved words are words that have special meaning to the shell and are recognized at the beginning of a line and @@ -634,7 +824,7 @@ In general, redirections open, close, or existing reference to a file. The overall format used for redirection is: .Pp -.Dl [n] Va redir-op Ar file +.Dl [n] Ns Va redir-op Ar file .Pp where .Va redir-op Index: src/bin/sh/syntax.c diff -u src/bin/sh/syntax.c:1.4 src/bin/sh/syntax.c:1.5 --- src/bin/sh/syntax.c:1.4 Wed Jun 7 05:08:32 2017 +++ src/bin/sh/syntax.c Mon Aug 21 13:20:49 2017 @@ -1,7 +1,7 @@ -/* $NetBSD: syntax.c,v 1.4 2017/06/07 05:08:32 kre Exp $ */ +/* $NetBSD: syntax.c,v 1.5 2017/08/21 13:20:49 kre Exp $ */ #include <sys/cdefs.h> -__RCSID("$NetBSD: syntax.c,v 1.4 2017/06/07 05:08:32 kre Exp $"); +__RCSID("$NetBSD: syntax.c,v 1.5 2017/08/21 13:20:49 kre Exp $"); #include <limits.h> #include "shell.h" @@ -63,6 +63,7 @@ const char sqsyntax[257] = { CEOF, set_range(CTL_FIRST, CTL_LAST, CCTL) set('\n', CNL) set('\'', CSQUOTE) + set('\\', CSBACK) /* ':/' for tilde expansion, '-' for [a\-x] pattern ranges */ set('!', CCTL) set('*', CCTL) Index: src/bin/sh/syntax.h diff -u src/bin/sh/syntax.h:1.8 src/bin/sh/syntax.h:1.9 --- src/bin/sh/syntax.h:1.8 Wed Jun 7 05:08:32 2017 +++ src/bin/sh/syntax.h Mon Aug 21 13:20:49 2017 @@ -1,4 +1,4 @@ -/* $NetBSD: syntax.h,v 1.8 2017/06/07 05:08:32 kre Exp $ */ +/* $NetBSD: syntax.h,v 1.9 2017/08/21 13:20:49 kre Exp $ */ /*- * Copyright (c) 1991, 1993 @@ -49,6 +49,7 @@ #define CEOF 10 /* end of file */ #define CCTL 11 /* like CWORD, except it must be escaped */ #define CSPCL 12 /* these terminate a word */ +#define CSBACK 13 /* a backslash in a single quote syntax */ /* Syntax classes for is_ functions */ #define ISDIGIT 01 /* a digit */ @@ -56,7 +57,7 @@ #define ISLOWER 04 /* a lower case letter */ #define ISUNDER 010 /* an underscore */ 
#define ISSPECL 020 /* the name of a special parameter */ -#define ISSPACE 040 /* a white space character */ +#define ISSPACE 040 /* a white space character */ #define PEOF (CHAR_MIN - 1) #define SYNBASE (-PEOF) @@ -69,15 +70,15 @@ /* These defines assume that the digits are contiguous (which is guaranteed) */ #define is_digit(c) ((unsigned)((c) - '0') <= 9) -#define sh_ctype(c) (is_type+SYNBASE)[(int)(c)] +#define sh_ctype(c) (is_type+SYNBASE)[(int)(c)] #define is_upper(c) (sh_ctype(c) & ISUPPER) #define is_lower(c) (sh_ctype(c) & ISLOWER) #define is_alpha(c) (sh_ctype(c) & (ISUPPER|ISLOWER)) #define is_name(c) (sh_ctype(c) & (ISUPPER|ISLOWER|ISUNDER)) #define is_in_name(c) (sh_ctype(c) & (ISUPPER|ISLOWER|ISUNDER|ISDIGIT)) -#define is_special(c) (sh_ctype(c) & (ISSPECL|ISDIGIT)) -#define is_space(c) (sh_ctype(c) & ISSPACE) -#define digit_val(c) ((c) - '0') +#define is_special(c) (sh_ctype(c) & (ISSPECL|ISDIGIT)) +#define is_space(c) (sh_ctype(c) & ISSPACE) +#define digit_val(c) ((c) - '0') extern const char basesyntax[]; extern const char dqsyntax[];