gbranden pushed a commit to branch master
in repository groff.
commit b80cdf13e61e502bd6c3f86aab5accd15af0e47a
Author: G. Branden Robinson <[email protected]>
AuthorDate: Sat Apr 18 08:06:26 2026 -0500
src/roff/troff/input.cpp: Annotate future plans.
---
src/roff/troff/input.cpp | 51 ++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 51 insertions(+)
diff --git a/src/roff/troff/input.cpp b/src/roff/troff/input.cpp
index 5c876ace2..5e93e032a 100644
--- a/src/roff/troff/input.cpp
+++ b/src/roff/troff/input.cpp
@@ -505,6 +505,24 @@ bool file_iterator::next_file(FILE *f, const char *s)
return true;
}
+// TODO: Define a function, say, process_input_character().
+//
+// Delegate the actual work on inbounding a UTF-8 sequence from the
+// standard I/O stream to some gnulib module (research needed). Prepare
+// for exceptional conditions:
+// 1. EOF
+// 2. incomplete UTF-8 sequence
+// 3. invalid UTF-8 sequence (overlong encoding, outside code range)
+//
+// If an exceptional condition occurs, throw an exception of a type we
+// define; our callers must catch it. The result of the exception is
+// likely either to abort collection of the syntactical item being
+// collected (such as an identifier) and/or to decide we've reached the
+// end of input. Follow the pattern(s) of existing EOF handling.
+//
+// If no exception occurs, apply Normalization Form D (if gnulib
+// can't/doesn't do that), and return an std::vector<> of `char32_t`.
+
// Returns an unsigned char or `EOF`.
int file_iterator::fill(node **)
{
@@ -515,6 +533,7 @@ int file_iterator::fill(node **)
ptr = p;
unsigned char *e = p + BUF_SIZE;
while (p < e) {
+ // TODO: process_input_character()
int c = getc(fp);
if (EOF == c)
break;
@@ -542,9 +561,11 @@ int file_iterator::fill(node **)
int file_iterator::peek()
{
+ // TODO: process_input_character()
int c = getc(fp);
while (is_invalid_input_char(c)) {
warning(WARN_INPUT, "invalid input character code %1", c);
+ // TODO: process_input_character()
c = getc(fp);
}
if (c != EOF)
@@ -1150,6 +1171,32 @@ static symbol
read_increment_and_escape_sequence_parameter(int *incp)
// stream are typically read into the contents of an existing node (like
// a string or macro definition), or discarded. A handful of escape
// sequences (\n, etc.) interpolate as they do outside of copy mode.
+//
+// XXX: This is one of the places where the rubber meets the road in the
+// "migrate GNU troff from reading unsigned chars to UTF-8" project,
+// because it returns an `int` and therefore can encode `EOF`, which the
+// rest of the code uses in a traditional C-idiomatic way.
+//
+// That idiom seems bad for us: reading a UTF-8 sequence adds a whole
+// layer of additional state because situations like a UTF-8 sequence
+// being invalid (e.g., possessing an overlength encoding), incomplete,
+// or outside the encoding range can happen. Even if some gnulib module
+// nicely wraps up and handles all that madness for us (and I think/hope
+// it does), there are still going to be exceptional conditions that are
+// impossible with a single-byte character encoding where all code point
+// values are valid (for reading purposes--not necessarily to GNU
+// troff). To be useful, gnulib (or whatever external UTF-8-chomping
+// library) has to communicate error information up to the application.
+//
+// Due to the variety of exceptional conditions, we might want to throw
+// and catch exceptions instead.
+//
+// Another place (_the_ other place?) is of course reading an input
+// character _not_ in copy mode--in interpretation mode, if you will.
+// Unfortunately that is done ad hoc wherever a lexical analysis
+// function needs to pump the input stream. We might need a counterpart
+// function, read_character(), or to make this that function, with an
+// additional Boolean parameter with a default value of `false`.
static int read_character_in_copy_mode(node **nd,
bool is_defining,
bool handle_escaped_E)
@@ -9615,6 +9662,10 @@ static void transparent_throughput_file_request()
else {
bool is_at_beginning_of_input_line = true;
for (;;) {
+ // TODO: Decide what "transparency" means when attempting to
+ // copy UTF-8 input "safely", unlike `cf`, which dumbly slings
+ // bytes from input to output without interpretation (which
+ // can produce wildly invalid "grout").
int c = getc(fp);
if (EOF == c)
break;
_______________________________________________
groff-commit mailing list
[email protected]
https://lists.gnu.org/mailman/listinfo/groff-commit