commit:     e565696396aec1826a27bb6dfacf3b56cb369d62
Author:     Kerin Millar <kfm <AT> plushkava <DOT> net>
AuthorDate: Tue Jan 27 03:09:17 2026 +0000
Commit:     Sam James <sam <AT> gentoo <DOT> org>
CommitDate: Tue Jan 27 09:48:28 2026 +0000
URL:        https://gitweb.gentoo.org/proj/gentoo-functions.git/commit/?id=e5656963

Introduce the shquote utility

As concerns the "functions.sh" unit, it provides a function named
quote_args(), the purpose of which is to transform its arguments in such
a way that they may safely be reused as input.

It has two implementations, one of which takes advantage of ${param@Q}
expansion in bash, and the other of which is implemented in awk. Of
these, the latter implementation is rather slow.

This commit introduces a utility by the name of "shquote" and integrates
it into the quote_args() function. It is written in C and is
significantly faster than the awk implementation that it replaces. In
addition to improving the performance, it requotes each argument in a
context-sensitive fashion, achieving optimal aesthetics.

$ utf8=$(printf '\345\223\210\347\275\227')
$ invalid_utf8=$(printf '\303\050')
$ quote_args foo bar 'baz quux' $'hi\nthere' "$invalid_utf8" "$utf8"
foo bar 'baz quux' $'hi\nthere' $'\303(' 哈罗

As before, dollar-single quoting may be suppressed by setting the
unfortunately named POSIXLY_CORRECT variable. I would have preferred to
remove this feature but am stymied by the fact that dash-0.5.13 remains
masked in Gentoo.

The utility is derived from code written by Leah Neukirchen and Rich
Felker, and is subject to the MIT license.

Signed-off-by: Kerin Millar <kfm <AT> plushkava.net>

 functions.sh   |  56 +------------------
 meson.build    |  14 ++++-
 shquote.c      | 168 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 test-functions |  36 +++++++++----
 4 files changed, 209 insertions(+), 65 deletions(-)

diff --git a/functions.sh b/functions.sh
index 1bb5ead..ec2de56 100644
--- a/functions.sh
+++ b/functions.sh
@@ -482,61 +482,9 @@ quote_args()
        # shellcheck disable=3028
        if [ ! "${POSIXLY_CORRECT}" ] && [ "${BASH_VERSINFO-0}" -ge 5 ]; then
                _quote_args_bash "$@"
-               return
+       else
+               shquote "$@"
        fi
-       LC_ALL=C awk -v q=\' -f - -- "$@" <<-'EOF'
-       function init_table() {
-               # Iterate over ranges \001-\037 and \177-\377.
-               for (i = 1; i <= 255; i += (i == 31 ? 96 : 1)) {
-                       char = sprintf("%c", i)
-                       seq_by[char] = sprintf("%03o", i)
-               }
-               seq_by["\007"] = "a"
-               seq_by["\010"] = "b"
-               seq_by["\011"] = "t"
-               seq_by["\012"] = "n"
-               seq_by["\013"] = "v"
-               seq_by["\014"] = "f"
-               seq_by["\015"] = "r"
-               seq_by["\033"] = "e"
-               seq_by["\047"] = "'"
-               seq_by["\134"] = "\\"
-       }
-       BEGIN {
-               issue = length(ENVIRON["POSIXLY_CORRECT"]) ? 7 : 8;
-               argc = ARGC
-               ARGC = 1
-               for (arg_idx = 1; arg_idx < argc; arg_idx++) {
-                       arg = ARGV[arg_idx]
-                       if (arg == q) {
-                               word = "\\" q
-                       } else if (issue < 8 || arg !~ /[\001-\037\177-\377]/) {
-                               gsub(q, q "\\" q q, arg)
-                               word = q arg q
-                       } else {
-                               # Use $'' quoting per POSIX-1.2024.
-                               if (! ("\001" in seq_by)) {
-                                       init_table()
-                               }
-                               word = "$'"
-                               for (i = 1; i <= length(arg); i++) {
-                                       char = substr(arg, i, 1)
-                                       if (char in seq_by) {
-                                               word = word "\\" seq_by[char]
-                                       } else {
-                                               word = word char
-                                       }
-                               }
-                               word = word q
-                       }
-                       line = line word
-                       if (arg_idx < argc - 1) {
-                               line = line " "
-                       }
-               }
-               print line
-       }
-       EOF
 }
 
 #

diff --git a/meson.build b/meson.build
index bcb4705..4be5cc1 100644
--- a/meson.build
+++ b/meson.build
@@ -1,7 +1,10 @@
 project(
   'gentoo-functions', 'c',
   version: '1.7.3',
-  license: 'GPL-2.0-only',
+  license: [
+    'GPL-2.0-only',
+    'MIT'
+  ],
   default_options : [
     'warning_level=2',
     'c_std=gnu11',
@@ -26,6 +29,12 @@ executable(
   install: true
 )
 
+executable(
+  'shquote',
+  'shquote.c',
+  install: true
+)
+
 install_man(
   'consoletype.1',
 )
@@ -36,6 +45,7 @@ if do_tests
     'test-functions', files('test-functions'),
     workdir : meson.current_source_dir(),
     protocol : 'tap',
-    verbose : true
+    verbose : true,
+    env : { 'BUILD_DIR' : meson.current_build_dir() }
   )
 endif

diff --git a/shquote.c b/shquote.c
new file mode 100644
index 0000000..e13d497
--- /dev/null
+++ b/shquote.c
@@ -0,0 +1,168 @@
+/*
+ * shquote - intelligently quotes arguments for use as shell input
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * This software is derived from Leah Neukirchen's lr utility.
+ */
+
+/*
+ * Copyright (C) 2025 Kerin Millar
+ * Copyright (C) 2015-2025 Leah Neukirchen <purl.org/net/chneukirchen>
+ * Parts of code derived from musl libc, which is
+ * Copyright (C) 2005-2014 Rich Felker, et al.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#define _POSIX_C_SOURCE 200809L
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+static int esc_mode = 2;
+
+static void print_shquoted(const char *s);
+static int u8decode(const char *cs, uint32_t *cp);
+
+int
+main(int argc, char *argv[])
+{
+       char *var = getenv("POSIXLY_CORRECT");
+       if (var != NULL && strlen(var))
+               /* Disallow dollar-single quoting. */
+               esc_mode = 1;
+
+       for (int i = 1; i < argc; i++) {
+               if (i > 1)
+                       putchar(' ');
+               print_shquoted(argv[i]);
+       }
+       putchar('\n');
+       return 0;
+}
+
+static void
+print_shquoted(const char *s)
+{
+       uint32_t ignored;
+       int l;
+
+       const char *t;
+       int esc = 0;
+
+       for (t = s; *t; ) {
+               if ((unsigned char)*t < 32 || strchr("'\177", *t)) {
+                       esc = esc_mode;
+                       break;
+               } else if (strchr("`^#*[]=|\\?${}()\"<>&;~\040", *t)) {
+                       /* Bias towards single quoting. */
+                       esc = 1;
+                       if (esc == esc_mode)
+                               break;
+                       t += 1;
+               } else {
+                       if ((l = u8decode(t, &ignored)) < 0) {
+                               /* Invalid UTF-8 byte sequence encountered. */
+                               esc = esc_mode;
+                               break;
+                       }
+                       t += l;
+               }
+       }
+
+       switch (esc) {
+       case 0:
+               /* Convey verbatim. */
+               printf("%s", s);
+               break;
+       case 1:
+               /* Employ single quoting. */
+               putchar('\'');
+               for (; *s; s++)
+                       if (*s == '\'')
+                               printf("'\\''");
+                       else
+                               putchar(*s);
+               putchar('\'');
+               break;
+       case 2:
+               /* Employ dollar-single quoting. */
+               printf("$'");
+               for (; *s; s++)
+                       switch (*s) {
+                       case '\a': printf("\\a"); break;
+                       case '\b': printf("\\b"); break;
+                       case '\e': printf("\\e"); break;
+                       case '\f': printf("\\f"); break;
+                       case '\n': printf("\\n"); break;
+                       case '\r': printf("\\r"); break;
+                       case '\t': printf("\\t"); break;
+                       case '\v': printf("\\v"); break;
+                       case '\\': printf("\\\\"); break;
+                       case '\'': printf("\\\'"); break;
+                       default:
+                               if ((unsigned char)*s < 32
+                                       || (unsigned char)*s == 127
+                                       || (l = u8decode(s, &ignored)) < 0) {
+                                       printf("\\%03o", (unsigned char)*s);
+                               } else {
+                                       printf("%.*s", l, s);
+                                       s += l-1;
+                               }
+                       }
+               putchar('\'');
+       }
+}
+
+/* Decode one UTF-8 codepoint into cp, return number of bytes to next one.
+ * On invalid UTF-8, return -1, and do not change cp.
+ * Invalid codepoints are not checked.
+ *
+ * This code is meant to be inlined, if cp is unused it can be optimized away.
+ */
+static int
+u8decode(const char *cs, uint32_t *cp)
+{
+       const uint8_t *s = (uint8_t *)cs;
+
+       if (*s == 0)   { *cp = 0; return 0; }
+       if (*s < 0x80) { *cp = *s; return 1; }
+       if (*s < 0xc2) { return -1; }  /*cont+overlong*/
+       if (*s < 0xe0) { *cp = *s & 0x1f; goto u2; }
+       if (*s < 0xf0) {
+               if (*s == 0xe0 && (s[1] & 0xe0) == 0x80) return -1; /*overlong*/
+               if (*s == 0xed && (s[1] & 0xe0) == 0xa0) return -1; /*surrogate*/
+               *cp = *s & 0x0f; goto u3;
+       }
+       if (*s < 0xf5) {
+               if (*s == 0xf0 && (s[1] & 0xf0) == 0x80) return -1; /*overlong*/
+               if (*s == 0xf4 && (s[1] > 0x8f)) return -1; /*too high*/
+               *cp = *s & 0x07; goto u4;
+       }
+       return -1;
+
+u4:    if ((*++s & 0xc0) != 0x80) return -1;  *cp = (*cp << 6) | (*s & 0x3f);
+u3:    if ((*++s & 0xc0) != 0x80) return -1;  *cp = (*cp << 6) | (*s & 0x3f);
+u2:    if ((*++s & 0xc0) != 0x80) return -1;  *cp = (*cp << 6) | (*s & 0x3f);
+       return s - (uint8_t *)cs + 1;
+}

diff --git a/test-functions b/test-functions
index feb46f5..4bd7643 100755
--- a/test-functions
+++ b/test-functions
@@ -996,9 +996,15 @@ test_quote_args() {
        set -- eq 0
 
        callback() {
-               local POSIXLY_CORRECT cksum fmt i str
+               local expected_cksum cksum fmt i str
 
-               test_description="quote_args output test (expecting cksum 380900690)"
+               if [ "${BASH}" ]; then
+                       expected_cksum=380900690
+               else
+                       expected_cksum=1492849101
+               fi
+
+               test_description="quote_args output test (expecting cksum ${expected_cksum})"
                i=0
                # The generator fails to produce the correct ouput in yash
                # unless the effective character type is C/POSIX. However, once
@@ -1006,14 +1012,17 @@ test_quote_args() {
                # if in its posix mode. As things stand, there is little point
                # in fixing it because yash also disables the local builtin in
                # its posix mode, causing test-functions to bail out sooner.
-               while [ "$((i += 1))" -le 255 ]; do
-                       fmt=$(printf '\\%o' "$i")
-                       # shellcheck disable=2059
-                       str=$(printf "$fmt.")
-                       quote_args "${str%.}" || break
-               done \
+               {
+                       POSIXLY_CORRECT=
+                       while [ "$((i += 1))" -le 255 ]; do
+                               fmt=$(printf '\\%o' "$i")
+                               # shellcheck disable=2059
+                               str=$(printf "${fmt}.")
+                               quote_args "${str%.}" || break
+                       done
+               } \
                | cksum \
-               | { read -r cksum _ && test "${cksum}" = "380900690"; }
+               | { read -r cksum _ && test "${cksum}" = "${expected_cksum}"; }
        }
 
        iterate_tests 2 "$@"
@@ -1242,6 +1251,15 @@ elif ! GENFUN_MODULES="portage rc" . ./functions.sh; then
        bailout "Couldn't source ./functions.sh"
 else
        assign_tmpdir
+
+       # Since the test suite is normally executed during the src_test phase,
+       # the shquote utility will not yet have been installed. Account for
+       # that by redefining the quote_args() function.
+       # shellcheck disable=3028
+       if [ "${EBUILD_PHASE}" = test ] && [ "${BASH_VERSINFO-0}" -lt 5 ]; then
+               quote_args() { "${BUILD_DIR:?}"/shquote "$@"; }
+       fi
+
        test_chdir || rc=1
        test_ebegin || rc=1
        test_is_older_than || rc=1

Reply via email to