Module Name:    src
Committed By:   apb
Date:           Thu Oct 23 14:19:33 UTC 2014

Modified Files:
        src/distrib/sets: join.awk

Log Message:
Add vis() function, and canonicalise file names via vis(unvis($1)).

XXX: The vis() function is very limited, due to the absence of ord()
in NetBSD's awk.


To generate a diff of this commit:
cvs rdiff -u -r1.4 -r1.5 src/distrib/sets/join.awk

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/distrib/sets/join.awk
diff -u src/distrib/sets/join.awk:1.4 src/distrib/sets/join.awk:1.5
--- src/distrib/sets/join.awk:1.4	Tue Oct 21 23:15:38 2014
+++ src/distrib/sets/join.awk	Thu Oct 23 14:19:33 2014
@@ -1,4 +1,4 @@
-#	$NetBSD: join.awk,v 1.4 2014/10/21 23:15:38 apb Exp $
+#	$NetBSD: join.awk,v 1.5 2014/10/23 14:19:33 apb Exp $
 #
 # Copyright (c) 2002 The NetBSD Foundation, Inc.
 # All rights reserved.
@@ -30,7 +30,8 @@
 # join.awk F1 F2
 #	Similar to join(1), this reads a list of words from F1
 #	and outputs lines in F2 with a first word that is in F1.
-#	Neither file needs to be sorted
+#	The first word is canonicalised via vis(unvis(word)).
+#	Neither file needs to be sorted.
 
 function unvis(s) \
 {
@@ -53,7 +54,7 @@ function unvis(s) \
 			s = substr(s, 3)
 		} else if (match(s, "\\\\[0-7][0-7][0-7]") == 1) {
 			# \ooo with three octal digits.
-			# XXX: use strnum() is that is available
+			# XXX: use strtonum() when that is available
 			unvis_result = unvis_result "" sprintf("%c", \
 				0+substr(s, 2, 1) * 64 + \
 				0+substr(s, 3, 1) * 8 + \
@@ -72,6 +73,60 @@ function unvis(s) \
 	return unvis_result
 }
 
+function vis(s) \
+{
+	# We need to encode backslash, space, and tab, because they
+	# would interfere with scripts that attempt to manipulate
+	# the set files.
+	#
+	# We make no attempt to encode shell special characters
+	# such as " ' $ ( ) { } [ ] < > * ?, because nothing that
+	# parses set files would need that.
+	#
+	# We would like to handle other white space or non-graph
+	# characters, because they may be confusing for human readers,
+	# but they are too difficult to handle in awk without the ord()
+	# function, so we print an error message.
+	#
+	# As of October 2014, no files in the set lists contain
+	# characters that would need any kind of encoding.
+	#
+	vis_result = ""
+	while (length(s) > 0) {
+		vis_pos = match(s, "(\\\\|[[:space:]]|[^[:graph:]])")
+		if (vis_pos == 0) {
+			vis_result = vis_result "" s
+			s = ""
+			break
+		}
+		# copy the part before the next special char
+		vis_result = vis_result "" substr(s, 1, vis_pos - 1)
+		vis_char = substr(s, vis_pos, 1)
+		s = substr(s, vis_pos + 1)
+		# process the special char
+		if (vis_char == "\\") {
+			# backslash -> double backslash
+			vis_result = vis_result "\\\\"
+		} else if (vis_char == " ") {
+			# space -> \040
+			vis_result = vis_result "\\040"
+		} else if (vis_char == "\t") {
+			# tab -> \011
+			vis_result = vis_result "\\011"
+		} else {
+			# generalised \ooo with three octal digits.
+			# XXX: I don't know how to do this in awk without ord()
+			printf "%s: %s:%s: cannot perform vis encoding\n", \
+				ARGV[0], (FILENAME ? FILENAME : "stdin"), FNR \
+				>"/dev/stderr"
+			vis_result = vis_result "" vis_char
+		}
+	}
+	return vis_result
+}
+
+// { $1 = vis(unvis($1)); print }
+
 BEGIN \
 {
 	if (ARGC != 3) {
@@ -79,13 +134,13 @@ BEGIN \
 		exit 1
 	}
 	while ( (getline < ARGV[1]) > 0) {
-		$1 = unvis($1)
+		$1 = vis(unvis($1))
 		words[$1] = $0
 	}
 	delete ARGV[1]
 }
 
-// { $1 = unvis($1) }
+// { $1 = vis(unvis($1)) }
 
 $1 in words \
 {

Reply via email to