gbranden pushed a commit to branch master
in repository groff.
commit 6008b6b7aa2920035e09d1dea44d262d30391195
Author: G. Branden Robinson <[email protected]>
AuthorDate: Thu Jan 18 12:45:57 2024 -0600
[troff]: Diagnose bogus composite char escapes.
[troff]: Diagnose bogus composite character escape sequences. That is,
when a composite character escape sequence like \[a ~] has a bogus
modifier (as opposed to base) character, meaning one that has not been
defined as the source _or_ destination of a `composite` request, report
an error. For instance, \[a $] is nonsense, barring a request like
`.composite $ \[uFF00]`, which would map `$`, when used as a modifier
character in a composite special character escape sequence, to U+FF00,
which would be a modifier form of the dollar sign in an alternate
universe.
* src/roff/troff/input.cpp (is_codepoint_composite): New function
searches `composite_dictionary` for the presence of the given
four-digit hexadecimal string as a key _or_ value.
* src/roff/troff/input.h: Expose foregoing function to other translation
units.
* src/roff/troff/node.cpp (make_glyph_node): Check input `charinfo` for
a Unicode code point sequence, and if it contains one, call
`valid_unicode_code_sequence()` to check it for validity. Then,
iterate through each code point after the first {the base character},
and call `is_codepoint_composite()` on it. Diagnose invalid composite
character and return null pointer if validation fails.
Input:
.nf
\[A a~]
\[A ~]
\[u0041_0301]
\[u0041_007E] \" should fail because 007E is explicitly spacing
\[u0041_0041] \" same reason, more obviously
\[u0041_0301_0301] \" should fail, would have a different meaning
\[u0041_007E_0301] \" both problems above
groff 1.23.0 and earlier:
$ groff -T ps -z EXPERIMENTS/composite_character_construction.groff
troff:...:5: warning: special character 'u0041_007E' not defined
troff:...:6: warning: special character 'u0041_0041' not defined
troff:...:7: warning: special character 'u0041_0301_0301' not defined
troff:...:8: warning: special character 'u0041_007E_0301' not defined
$ groff -Tutf8 -z EXPERIMENTS/composite_character_construction.groff
[no output due to Savannah #65109]
Now:
$ ./build/test-groff -T ps -z
EXPERIMENTS/composite_character_construction.groff
troff:...:5: warning: special character 'u0041_007E' not defined
troff:...:6: error: cannot format glyph: 'u0041_0041' is not a valid
composite character
troff:...:7: warning: special character 'u0041_0301_0301' not defined
troff:...:8: warning: special character 'u0041_007E_0301' not defined
$ ./build/test-groff -T utf8 -z
EXPERIMENTS/composite_character_construction.groff
troff:...:6: error: cannot format glyph: 'u0041_0041' is not a valid
composite character
---
ChangeLog | 27 +++++++++++++++++++++++++++
src/roff/troff/input.cpp | 32 ++++++++++++++++++++++++++++++++
src/roff/troff/input.h | 1 +
src/roff/troff/node.cpp | 24 +++++++++++++++++++++++-
4 files changed, 83 insertions(+), 1 deletion(-)
diff --git a/ChangeLog b/ChangeLog
index e500142e4..023054db6 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,30 @@
+2024-01-18 G. Branden Robinson <[email protected]>
+
+ [troff]: Diagnose bogus composite character escape sequences.
+ That is, when a composite character escape sequence like \[a ~]
+ has a bogus modifier (as opposed to base) character, meaning one
+ that has not been defined as the source _or_ destination of a
+ `composite` request, report an error. For instance, \[a $] is
+ nonsense, barring a request like `.composite $ \[uFF00]`, which
+ would map `$`, when used as a modifier character in a composite
+ special character escape sequence, to U+FF00, which would be a
+ modifier form of the dollar sign in an alternate universe.
+
+ * src/roff/troff/input.cpp (is_codepoint_composite): New
+ function searches `composite_dictionary` for the presence of the
+ given four-digit hexadecimal string as a key _or_ value.
+
+ * src/roff/troff/input.h: Expose foregoing function to other
+ translation units.
+
+ * src/roff/troff/node.cpp (make_glyph_node): Check input
+ `charinfo` for a Unicode code point sequence, and if it contains
+ one, call `valid_unicode_code_sequence()` to check it for
+ validity. Then, iterate through each code point after the first
+ {the base character}, and call `is_codepoint_composite()` on it.
+ Diagnose invalid composite character and return null pointer if
+ validation fails.
+
2024-01-17 G. Branden Robinson <[email protected]>
* src/roff/troff/input.cpp (map_composite_character): Stop
diff --git a/src/roff/troff/input.cpp b/src/roff/troff/input.cpp
index 0cbec2f4d..94a977e38 100644
--- a/src/roff/troff/input.cpp
+++ b/src/roff/troff/input.cpp
@@ -4225,6 +4225,38 @@ static symbol composite_glyph_name(symbol nm)
return symbol(gl.contents());
}
+// Report whether `n`, a hexadecimal four-character sequence naming a
+// code point, takes part in a composite mapping.  An entry in the
+// composite dictionary qualifies if either its key or its value
+// equals `n`.
+//
+// Returns true on the first match and false if no entry matches.
+//
+// The dictionary is walked entry by entry, so the cost grows with
+// its size; by default groff defines only 22 composite character
+// mappings ("tmac/composite.tmac").  If this becomes a performance
+// problem, we will need another dictionary mapping the unique
+// values of `composite_dictionary` (which is not one-to-one) to a
+// Boolean.
+bool is_codepoint_composite(const char *n)
+{
+  dictionary_iterator entries(composite_dictionary);
+  symbol key;
+  char *value;
+  while (entries.get(&key, reinterpret_cast<void **>(&value))) {
+    assert(!key.is_null());
+    assert(value != 0 /* nullptr */);
+    // A match on the entry's key qualifies...
+    if (strcmp(key.contents(), n) == 0)
+      return true;
+    // ...and so does a match on the entry's value.
+    if (strcmp(value, n) == 0)
+      return true;
+  }
+  // Every entry was visited without finding `n`.
+  return false;
+}
+
static void report_composite_characters()
{
dictionary_iterator iter(composite_dictionary);
diff --git a/src/roff/troff/input.h b/src/roff/troff/input.h
index e78124f92..179feabd3 100644
--- a/src/roff/troff/input.h
+++ b/src/roff/troff/input.h
@@ -112,6 +112,7 @@ const int INPUT_SOFT_HYPHEN= 0312;
extern void do_glyph_color(symbol);
extern void do_fill_color(symbol);
+extern bool is_codepoint_composite(const char *n);
// Local Variables:
// fill-column: 72
diff --git a/src/roff/troff/node.cpp b/src/roff/troff/node.cpp
index 719bb2f3a..c7f9116bd 100644
--- a/src/roff/troff/node.cpp
+++ b/src/roff/troff/node.cpp
@@ -1,4 +1,4 @@
-/* Copyright (C) 1989-2020 Free Software Foundation, Inc.
+/* Copyright (C) 1989-2024 Free Software Foundation, Inc.
Written by James Clark ([email protected])
This file is part of groff.
@@ -36,6 +36,7 @@ along with this program. If not, see
<http://www.gnu.org/licenses/>. */
#include "charinfo.h"
#include "input.h"
#include "geometry.h"
+#include "unicode.h" // valid_unicode_code_sequence()
#include "nonposix.h"
@@ -4910,6 +4911,27 @@ static node *make_glyph_node(charinfo *s, environment
*env,
error("cannot format glyph: no current font");
return 0 /* nullptr */;
}
+ const char *seq = valid_unicode_code_sequence(s->nm.contents());
+ if (seq != 0 /* nullptr */) {
+ // If it is a multi-character sequence like u1234_5678, every code
+ // point after the first must have (or be) a composite mapping.
+ char codepoint[5] = { 0, 0, 0, 0, 0};
+ bool is_composite_glyph_valid = true;
+ while ((seq = strchr(seq, '_')) != 0 /* nullptr */) {
+ seq++;
+ (void) strncpy(codepoint, seq, 4);
+ if (!is_codepoint_composite(codepoint)) {
+ is_composite_glyph_valid = false;
+ break;
+ }
+ seq += 4;
+ }
+ if (!is_composite_glyph_valid) {
+ error("cannot format glyph: '%1' is not a valid composite"
+ " character", s->nm.contents());
+ return 0 /* nullptr */;
+ }
+ }
assert(fontno < font_table_size && font_table[fontno] != 0);
int fn = fontno;
bool found = font_table[fontno]->contains(s);
_______________________________________________
Groff-commit mailing list
[email protected]
https://lists.gnu.org/mailman/listinfo/groff-commit