Follow-up Comment #9, bug #40720 (group groff): [comment #8 comment #8:] > [comment #7 comment #7:] >> Unless I'm misunderstanding you and Ingo, > > Possibly, because the point made in this paragraph seems to have little > bearing on our proposal. > >> at the level GNU _troff_ needs to work on characters, I think >> it expects singleton items > > GNU troff can already, and has been able to for years, handle text full of > "characters" spelled like \[u1F913]. Whatever internal storage it uses is > already up to this task.
Right, that's called a _charinfo_, and it's a data structure associated with any character that can be formatted into a diversion (including the top-level one, a.k.a. "output"). The logic is in the `token::next()` member function defined in "input.cpp". There is [https://cgit.git.savannah.gnu.org/cgit/groff.git/tree/src/roff/troff/input.cpp?h=1.23.0#n1935 one handler for the form `\(xx`] and [https://cgit.git.savannah.gnu.org/cgit/groff.git/tree/src/roff/troff/input.cpp?h=1.23.0#n2283 another for the form `\[xxxxx]`]. > What it can't do is read UTF-8 input containing the character 🤓. It needs > preconv to convert this into \[u1F913]. Right, we can change that, and that's the objective of this ticket. > But that's only an _input_ problem. Once troff is past the input stage, if > it continues to store the character the same way it has stored \[u1F913] for > years, much of the code base can remain untouched. Only the part of troff > that reads input needs to change. There are several such parts. Here's an in-progress diff of the change to make the reading of input characters more type-correct. (This, in turn, is more appropriate for bug #67734, but alas, we seem cursed to commingle these discussions.) $ git diff | cat diff --git a/src/roff/troff/div.cpp b/src/roff/troff/div.cpp index fcd657b68..bde7d8c0f 100644 --- a/src/roff/troff/div.cpp +++ b/src/roff/troff/div.cpp @@ -1024,7 +1024,7 @@ void return_request() { vunits dist = curdiv->marked_place - curdiv->get_vertical_position(); if (has_arg()) { - if (tok.ch() == '-') { + if (tok.ch() == int('-')) { // TODO: grochar tok.next(); vunits x; if (get_vunits(&x, 'v')) diff --git a/src/roff/troff/env.cpp b/src/roff/troff/env.cpp index b47543adc..b1e6912b7 100644 --- a/src/roff/troff/env.cpp +++ b/src/roff/troff/env.cpp @@ -2735,7 +2735,8 @@ void adjust() { curenv->adjust_mode |= 1; if (has_arg()) { - switch (tok.ch()) { + int c = tok.ch(); // safely compares to char literals; TODO: grochar + switch (c) { case 'l': curenv->adjust_mode = ADJUST_LEFT; break; @@ -3009,7 +3010,8 @@ static void configure_tab_stops_request() bool is_repeating_stop = false; tab_stops tabs; while (has_arg()) { - if (tok.ch() == TAB_REPEAT_CHAR) { + int c = tok.ch(); // safely compares to char literals; TODO: grochar + if (c == TAB_REPEAT_CHAR) { tok.next(); is_repeating_stop = true; prev_pos = 0; @@ -3017,15 +3019,15 @@ static void configure_tab_stops_request() if (!get_hunits(&pos, 'm', prev_pos)) break; tab_type type = TAB_LEFT; - if (tok.ch() == 'C') { + if (c == int('C')) { // TODO: grochar tok.next(); type = TAB_CENTER; } - else if (tok.ch() == 'R') { + else if (c == int('R')) { // TODO: grochar tok.next(); type = TAB_RIGHT; } - else if (tok.ch() == 'L') { + else if (c == int('L')) { // TODO: grochar tok.next(); } if (pos <= prev_pos && ((!is_first_stop) || is_repeating_stop)) diff --git a/src/roff/troff/input.cpp b/src/roff/troff/input.cpp index 7e139f83d..eca239e8a 100644 --- a/src/roff/troff/input.cpp +++ b/src/roff/troff/input.cpp @@ -1786,7 +1786,7 @@ static const char *do_name_test() // \A if (tok == start_token && (want_att_compat || input_stack::get_level() == start_level)) break; - if (!tok.ch()) + if (tok.ch() == 0U) got_bad_char = true; got_some_char = true; } @@ -3106,9 +3106,9 @@ symbol read_identifier(bool required) if (want_att_compat) { char buf[3]; tok.skip_spaces(); - if ((buf[0] = tok.ch()) != 0) { + if ((buf[0] = (tok.ch()) != 0U)) { tok.next(); - if ((buf[1] = tok.ch()) != 0) { + if ((buf[1] = (tok.ch()) != 0U)) { buf[2] = 0; tok.make_space(); } @@ -3161,7 +3161,7 @@ static symbol do_get_long_name(bool required, char end_char) buf_size = new_buf_size; delete[] old_buf; } - if ((buf[i] = tok.ch()) == '\0' || (buf[i] == end_char)) + if ((buf[i] = ((tok.ch()) == 0U) || (buf[i] == end_char)) break; i++; tok.next(); @@ -3243,7 +3243,7 @@ void exit_request() void return_macro_request() { - if (has_arg() && tok.ch()) + if (has_arg() && (tok.ch() != 0U)) input_stack::pop_macro(); input_stack::pop_macro(); tok.next(); @@ -6015,31 +6015,31 @@ static bool get_line_arg(units *n, unsigned char si, charinfo **cip) static bool read_size(int *x) // \s { tok.next(); - int c = tok.ch(); + int c = tok.ch(); // safely compares to char literals; TODO: grochar int inc = 0; - if (c == '-') { + if (c == int('-')) { // TODO: grochar inc = -1; tok.next(); c = tok.ch(); } - else if (c == '+') { + else if (c == int('+')) { // TODO: grochar inc = 1; tok.next(); c = tok.ch(); } int val = 0; // pacify compiler bool contains_invalid_digit = false; - if (c == '(') { + if (c == int('(')) { // TODO: grochar tok.next(); c = tok.ch(); if (!inc) { // allow an increment either before or after the left parenthesis - if (c == '-') { + if (c == int('-')) { // TODO: grochar inc = -1; tok.next(); c = tok.ch(); } - else if (c == '+') { + else if (c == int('+')) { // TODO: grochar inc = 1; tok.next(); c = tok.ch(); @@ -6054,7 +6054,7 @@ static bool read_size(int *x) // \s if (!csdigit(c)) contains_invalid_digit = true; else { - val = val*10 + (c - '0'); + val = val * 10 + (c - '0'); val *= sizescale; } } @@ -6068,7 +6068,7 @@ static bool read_size(int *x) // \s if (!csdigit(c)) contains_invalid_digit = true; else { - val = val*10 + (c - '0'); + val = val * 10 + (c - '0'); error("ambiguous type size in escape sequence; rewrite to use" " '%1s(%2' or similar", static_cast<char>(escape_char), val); @@ -6102,8 +6102,9 @@ static bool read_size(int *x) // \s } if (!read_measurement(&val, 'z')) return false; - if (!(start.ch() == '[' && tok.ch() == ']') && start != tok) { - if (start.ch() == '[') + int s = tok.ch(); // safely compares to char literals; TODO: grochar + if (!(s == int('[') && s == int(']')) && start != tok) { + if (s == int('[')) error("missing ']' in type size escape sequence"); else error("missing closing delimiter in type size escape sequence"); @@ -6192,7 +6193,7 @@ static symbol get_delimited_name() && (want_att_compat || (input_stack::get_level() == start_level))) break; - if ((buf[i] = tok.ch()) == '\0') { + if ((buf[i] = (tok.ch()) == 0U) { // token::description() writes to static, class-wide storage, so // we must allocate a copy of it before issuing the next // diagnostic. @@ -6621,7 +6622,7 @@ static node *do_device_extension() // \X if (tok == start_token && (want_att_compat || input_stack::get_level() == start_level)) break; - unsigned char c; + unsigned char c; // TODO: grochar if (tok.is_space()) c = ' '; // TODO: Stop silently ignoring these when we have a string @@ -7012,21 +7013,21 @@ static bool is_conditional_expression_true() bool perform_output_comparison = false; bool want_test_sense_inverted = false; tok.skip_spaces(); - while (tok.ch() == '!') { + int c = tok.ch(); // safely compares to char literals; TODO: grochar + while (c == int('!')) { // TODO: grochar tok.next(); want_test_sense_inverted = !want_test_sense_inverted; } bool result; - unsigned char c = tok.ch(); if (want_att_compat) switch (c) { - case 'F': - case 'S': - case 'c': - case 'd': - case 'm': - case 'r': - case 'v': + case int('F'): // TODO: grochar + case int('S'): // TODO: grochar + case int('c'): // TODO: grochar + case int('d'): // TODO: grochar + case int('m'): // TODO: grochar + case int('r'): // TODO: grochar + case int('v'): // TODO: grochar warning(WARN_SYNTAX, "conditional expression operator '%1' is not portable to" " AT&T troff", @@ -7036,25 +7037,25 @@ static bool is_conditional_expression_true() default: break; } - if (c == 't') { + if (c == int('t')) { // TODO: grochar tok.next(); result = !in_nroff_mode; } - else if (c == 'n') { + else if (c == int('n')) { // TODO: grochar tok.next(); result = in_nroff_mode; } - else if (c == 'o') { - result = (topdiv->get_page_number() & 1); + else if (c == int('o')) { // TODO: grochar + result = (topdiv->get_page_number() & 1); // TODO: dump cleverness tok.next(); } - else if (c == 'e') { - result = !(topdiv->get_page_number() & 1); + else if (c == int('e')) { // TODO: grochar + result = !(topdiv->get_page_number() & 1); // TODO: dump cleverness tok.next(); } // TODO: else if (!want_att_compat) { // Check for GNU troff extended conditional expression operators. - else if ((c == 'd') || (c == 'r')) { + else if ((c == int('d') || (c == int('r')))) { // TODO: grochar tok.next(); symbol nm = read_identifier(true /* required */); if (nm.is_null()) { @@ -8302,7 +8303,7 @@ void warnscale_request() skip_line(); return; } - char c = tok.ch(); + int c = tok.ch(); // safely compares to char literals; TODO: grochar if (c == 'u') warn_scale = 1.0; else if (c == 'i') @@ -8642,7 +8643,9 @@ static void define_class_request() // Chained range expressions like // \[u3041]-\[u3096]-\[u30FF] // are not valid. - if ((child1 != 0 /* nullptr */) && (tok.ch() == '-')) { + // Promote `tok.ch()` to an int to safely compare to char literal. + // TODO: cast '-' to grochar + if ((child1 != 0 /* nullptr */) && (int(tok.ch()) == '-')) { tok.next(); child2 = tok.get_charinfo(); if (0 /* nullptr */ == child2) { @@ -10205,7 +10208,8 @@ static node *read_drawing_command() // \D warning(WARN_MISSING, "missing arguments to drawing escape" " sequence"); else { - unsigned char type = tok.ch(); + int type = tok.ch(); // safely compares to char literals + // TODO: grochar if (type == 'F') { read_drawing_command_color_arguments(start_token); return 0 /* nullptr */; @@ -10321,10 +10325,12 @@ static void read_drawing_command_color_arguments(token &start) error("missing color scheme"); return; } - unsigned char scheme = tok.ch(); + // safely compares to char literals; TODO: grochar + int scheme = tok.ch(); tok.next(); color *col = 0 /* nullptr */; - char end = start.ch(); + // TODO: grochar + int end = start.ch(); switch (scheme) { case 'c': col = read_cmy(end); diff --git a/src/roff/troff/number.cpp b/src/roff/troff/number.cpp index a55c24eef..16a685f75 100644 --- a/src/roff/troff/number.cpp +++ b/src/roff/troff/number.cpp @@ -223,11 +223,11 @@ static incr_number_result get_incr_number(units *res, unsigned char si) if (!is_valid_expression_start()) return INVALID; incr_number_result result = ASSIGN; - if (tok.ch() == '+') { + if (tok.ch() == int('+')) { // TODO: grochar tok.next(); result = INCREMENT; } - else if (tok.ch() == '-') { + else if (tok.ch() == int('-')) { // TODO: grochar tok.next(); result = DECREMENT; } @@ -264,7 +264,7 @@ static bool is_valid_expression(units *u, int scaling_unit, while (result) { if (is_parenthesized) tok.skip_spaces(); - int op = tok.ch(); + int op = tok.ch();// safely compares to char literals; TODO: grochar switch (op) { case '+': case '-': @@ -277,29 +277,29 @@ static bool is_valid_expression(units *u, int scaling_unit, break; case '>': tok.next(); - if (tok.ch() == '=') { + if (tok.ch() == int('=')) { // TODO: grochar tok.next(); op = OP_GEQ; } - else if (tok.ch() == '?') { + else if (tok.ch() == int('?')) { // TODO: grochar tok.next(); op = OP_MAX; } break; case '<': tok.next(); - if (tok.ch() == '=') { + if (tok.ch() == int('=')) { // TODO: grochar tok.next(); op = OP_LEQ; } - else if (tok.ch() == '?') { + else if (tok.ch() == int('?')) { // TODO: grochar tok.next(); op = OP_MIN; } break; case '=': tok.next(); - if (tok.ch() == '=') + if (tok.ch() == int('=')) // TODO: grochar tok.next(); break; default: @@ -387,15 +387,15 @@ static bool is_valid_term(units *u, int scaling_unit, for (;;) if (is_parenthesized && tok.is_space()) tok.next(); - else if (tok.ch() == '+') + else if (tok.ch() == int('+')) // TODO: grochar tok.next(); - else if (tok.ch() == '-') { + else if (tok.ch() == int('-')) { // TODO: grochar tok.next(); is_negative = !is_negative; } else break; - unsigned char c = tok.ch(); + int c = tok.ch(); // safely compares to char literals; TODO: grochar switch (c) { case '|': // | is not restricted to the outermost level @@ -418,7 +418,7 @@ static bool is_valid_term(units *u, int scaling_unit, case '(': tok.next(); c = tok.ch(); - if (c == ')') { + if (c == ')') { // TODO: grochar if (is_mandatory) return false; warning(WARN_SYNTAX, "empty parentheses"); @@ -428,7 +428,7 @@ static bool is_valid_term(units *u, int scaling_unit, } else if (c != 0 && strchr(SCALING_UNITS, c) != 0) { tok.next(); - if (tok.ch() == ';') { + if (tok.ch() == ';') { // TODO: grochar tok.next(); scaling_unit = c; } @@ -446,7 +446,7 @@ static bool is_valid_term(units *u, int scaling_unit, true /* is_parenthesized */, is_mandatory)) return false; tok.skip_spaces(); - if (tok.ch() != ')') { + if (tok.ch() != ')') { // TODO: grochar if (is_mandatory) return false; warning(WARN_SYNTAX, "expected ')', got %1", tok.description()); @@ -507,7 +507,7 @@ static bool is_valid_term(units *u, int scaling_unit, return false; } int divisor = 1; - if (tok.ch() == '.') { + if (tok.ch() == int('.')) { // TODO: grochar tok.next(); for (;;) { c = tok.ch(); diff --git a/src/roff/troff/reg.cpp b/src/roff/troff/reg.cpp index 1314ba23e..de010981a 100644 --- a/src/roff/troff/reg.cpp +++ b/src/roff/troff/reg.cpp @@ -447,7 +447,7 @@ static void assign_register_format_request() register_dictionary.define(nm, r); } tok.skip_spaces(); - char c = tok.ch(); + int c = tok.ch(); // safely compares to char literals; TODO: grochar if (csdigit(c)) { int n = 0; do { @@ -456,7 +456,10 @@ static void assign_register_format_request() } while (csdigit(tok.ch())); r->alter_format('1', n); } - else if (c == 'i' || c == 'I' || c == 'a' || c == 'A') + else if ((c == int('i')) + || (c == int('I')) + || (c == int('a')) + || (c == int('A'))) // TODO: grochar * 4 r->alter_format(c); else if (!has_arg()) warning(WARN_MISSING, "register interpolation format assignment" >> to prevent the Savannah web UI from mistaking my asterisks for >> emphasis markup > > It's clunky, but savannah's +nomarkup++-nomarkup-nomarkup+ tag lets you > +nomarkup+char * char * char *-nomarkup- all day long. Never knew about that! I'll try it on the foregoing... ...hmmm, didn't quite work (see the second "handler" hyperlink, which ends in the wrong place with these tags and doesn't get formatted at all with nomarkup tags within the link text). I seem to prone to instinctively breaking context-sensitive languages. _______________________________________________________ Reply to this item at: <https://savannah.gnu.org/bugs/?40720> _______________________________________________ Message sent via Savannah https://savannah.gnu.org/
signature.asc
Description: PGP signature
