[PATCH v2] [eqn]: Use XML character references in MathML output.

Nguyễn Gia Phong via discussion of the GNU roff typesetting system and related software Wed, 25 Mar 2026 21:58:14 -0700

XML only predefines five entities (&lt; &gt; &amp; &apos; &quot;);
all others need to be declared in the document's DOCTYPE.
For web feeds such as RSS and Atom, it is particularly cumbersome
to define the math entities as these feeds are supposed
to be stand-alone and thus the entity definitions have to be inlined.


Therefore, character references are now used
instead of entity references, making the MathML output
directly embeddable into these feeds.  The entity table
is no longer used and thus removed.

* src/preproc/eqn/text.cpp: Remove struct map, entity_table,
  and special_to_entity.  Include "unicode.h" header file.
  (special_char_box::output): Instead of named entity reference,
  print XML character reference with Unicode codepoint for MathML.
  Add support for Unicode code sequence as an input character.

References: https://www.w3.org/TR/REC-xml/#sec-references
---
 src/preproc/eqn/text.cpp | 386 +--------------------------------------
 1 file changed, 6 insertions(+), 380 deletions(-)

diff --git a/src/preproc/eqn/text.cpp b/src/preproc/eqn/text.cpp
index 19b2e8f684ab..284ff1ff1101 100644
--- a/src/preproc/eqn/text.cpp
+++ b/src/preproc/eqn/text.cpp
@@ -27,383 +27,7 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>. */
 #include "eqn.h"
 #include "pbox.h"
 #include "ptable.h"
-
-struct map {
-  const char *from;
-  const char *to;
-};
-
-struct map entity_table[] = {
-  // Classic troff special characters
-  {"%", "&shy;"},      // ISOnum
-  {"'", "&acute;"},    // ISOdia
-  {"!=", "&ne;"},      // ISOtech
-  {"**", "&lowast;"},  // ISOtech
-  {"*a", "&alpha;"},   // ISOgrk3
-  {"*A", "A"},
-  {"*b", "&beta;"},    // ISOgrk3
-  {"*B", "B"},
-  {"*d", "&delta;"},   // ISOgrk3
-  {"*D", "&Delta;"},   // ISOgrk3
-  {"*e", "&epsilon;"}, // ISOgrk3
-  {"*E", "E"},
-  {"*f", "&phi;"},     // ISOgrk3
-  {"*F", "&Phi;"},     // ISOgrk3
-  {"*g", "&gamma;"},   // ISOgrk3
-  {"*G", "&Gamma;"},   // ISOgrk3
-  {"*h", "&theta;"},   // ISOgrk3
-  {"*H", "&Theta;"},   // ISOgrk3
-  {"*i", "&iota;"},    // ISOgrk3
-  {"*I", "I"},
-  {"*k", "&kappa;"},   // ISOgrk3
-  {"*K", "K;"},
-  {"*l", "&lambda;"},  // ISOgrk3
-  {"*L", "&Lambda;"},  // ISOgrk3
-  {"*m", "&mu;"},      // ISOgrk3
-  {"*M", "M"},
-  {"*n", "&nu;"},      // ISOgrk3
-  {"*N", "N"},
-  {"*o", "o"},
-  {"*O", "O"},
-  {"*p", "&pi;"},      // ISOgrk3
-  {"*P", "&Pi;"},      // ISOgrk3
-  {"*q", "&psi;"},     // ISOgrk3
-  {"*Q", "&PSI;"},     // ISOgrk3
-  {"*r", "&rho;"},     // ISOgrk3
-  {"*R", "R"},
-  {"*s", "&sigma;"},   // ISOgrk3
-  {"*S", "&Sigma;"},   // ISOgrk3
-  {"*t", "&tau;"},     // ISOgrk3
-  {"*T", "&Tau;"},     // ISOgrk3
-  {"*u", "&upsilon;"}, // ISOgrk3
-  {"*U", "&Upsilon;"}, // ISOgrk3
-  {"*w", "&omega;"},   // ISOgrk3
-  {"*W", "&Omega;"},   // ISOgrk3
-  {"*x", "&chi;"},     // ISOgrk3
-  {"*X", "&Chi;"},     // ISOgrk3
-  {"*y", "&eta;"},     // ISOgrk3
-  {"*Y", "&Eta;"},     // ISOgrk3
-  {"*z", "&zeta;"},    // ISOgrk3
-  {"*Z", "&Zeta;"},    // ISOgrk3
-  {"+-", "&plusmn;"},  // ISOnum
-  {"->", "&rarr;"},    // ISOnum
-  {"12", "&frac12;"},  // ISOnum
-  {"14", "&frac14;"},  // ISOnum
-  {"34", "&frac34;"},  // ISOnum
-  {"<-", "&larr;"},    // ISOnum
-  {"==", "&equiv;"},   // ISOtech
-  {"Fi", "&ffilig;"},  // ISOpub
-  {"Fl", "&ffllig;"},  // ISOpub
-  {"aa", "&acute;"},   // ISOdia
-  {"ap", "&sim;"},     // ISOtech
-  {"bl", "&phonexb;"}, // ISOpub
-  {"br", "&boxv;"},    // ISObox
-  {"bs", "&phone;"},   // ISOpub (for the Bell logo)
-  {"bu", "&bull;"},    // ISOpub
-  {"bv", "&verbar;"},  // ISOnum
-  {"ca", "&cap;"},     // ISOtech
-  {"ci", "&cir;"},     // ISOpub
-  {"co", "&copy;"},    // ISOnum
-  {"ct", "&cent;"},    // ISOnum
-  {"cu", "&cup;"},     // ISOtech
-  {"da", "&darr;"},    // ISOnum
-  {"de", "&deg;"},     // ISOnum
-  {"dg", "&dagger;"},  // ISOpub
-  {"dd", "&Dagger;"},  // ISOpub
-  {"di", "&divide;"},  // ISOnum
-  {"em", "&mdash;"},   // ISOpub
-  {"eq", "&equals;"},  // ISOnum
-  {"es", "&empty;"},   // ISOamso
-  {"ff", "&fflig;"},   // ISOpub
-  {"fi", "&filig;"},   // ISOpub
-  {"fl", "&fllig;"},   // ISOpub
-  {"fm", "&prime;"},   // ISOtech
-  {"ge", "&ge;"},      // ISOtech
-  {"gr", "&nabla;"},   // ISOtech
-  {"hy", "&hyphen;"},  // ISOnum
-  {"ib", "&sube;"},    // ISOtech
-  {"if", "&infin;"},   // ISOtech
-  {"ip", "&supe;"},    // ISOtech
-  {"is", "&int;"},     // ISOtech
-  {"le", "&le;"},      // ISOtech
-  // Some pile characters go here
-  {"mi", "&minus;"},   // ISOtech
-  {"mo", "&isin;"},    // ISOtech
-  {"mu", "&times;"},   // ISOnum
-  {"no", "&not;"},     // ISOnum
-  {"or", "&verbar;"},  // ISOnum
-  {"pl", "&plus;"},    // ISOnum
-  {"pt", "&prop;"},    // ISOtech
-  {"rg", "&trade;"},   // ISOnum
-  // More pile characters go here
-  {"rn", "&macr;"},    // ISOdia
-  {"ru", "&lowbar;"},  // ISOnum
-  {"sb", "&sub;"},     // ISOtech
-  {"sc", "&sect;"},    // ISOnum
-  {"sl", "/"},
-  {"sp", "&sup;"},     // ISOtech
-  {"sq", "&squf;"},    // ISOpub
-  {"sr", "&radic;"},   // ISOtech
-  {"ts", "&sigmav;"},  // ISOgrk3
-  {"ua", "&uarr;"},    // ISOnum
-  {"ul", "_"},
-  {"~=", "&cong;"},    // ISOtech
-  // Extended specials supported by groff; see groff_char(7).
-  // These are listed in the order they occur on that man page.
-  {"-D", "&ETH;"},     // ISOlat: Icelandic uppercase eth
-  {"Sd", "&eth;"},     // ISOlat1: Icelandic lowercase eth
-  {"TP", "&THORN;"},   // ISOlat1: Icelandic uppercase thorn
-  {"Tp", "&thorn;"},   // ISOlat1: Icelandic lowercase thorn
-  {"ss", "&szlig;"},   // ISOlat1
-  // Ligatures
-  // ff, fi, fl, ffi, ffl from old troff go here
-  {"AE", "&AElig;"},   // ISOlat1
-  {"ae", "&aelig;"},   // ISOlat1
-  {"OE", "&OElig;"},   // ISOlat2
-  {"oe", "&oelig;"},   // ISOlat2
-  {"IJ", "&ijlig;"},   // ISOlat2: Dutch IJ ligature
-  {"ij", "&IJlig;"},   // ISOlat2: Dutch ij ligature
-  {".i", "&inodot;"},  // ISOlat2,ISOamso
-  {".j", "&jnodot;"},  // ISOamso (undocumented but in 1.19)
-  // Accented characters
-  {"'A", "&Aacute;"},  // ISOlat1
-  {"'C", "&Cacute;"},  // ISOlat2
-  {"'E", "&Eacute;"},  // ISOlat1
-  {"'I", "&Iacute;"},  // ISOlat1
-  {"'O", "&Oacute;"},  // ISOlat1
-  {"'U", "&Uacute;"},  // ISOlat1
-  {"'Y", "&Yacute;"},  // ISOlat1
-  {"'a", "&aacute;"},  // ISOlat1
-  {"'c", "&cacute;"},  // ISOlat2
-  {"'e", "&eacute;"},  // ISOlat1
-  {"'i", "&iacute;"},  // ISOlat1
-  {"'o", "&oacute;"},  // ISOlat1
-  {"'u", "&uacute;"},  // ISOlat1
-  {"'y", "&yacute;"},  // ISOlat1
-  {":A", "&Auml;"},    // ISOlat1
-  {":E", "&Euml;"},    // ISOlat1
-  {":I", "&Iuml;"},    // ISOlat1
-  {":O", "&Ouml;"},    // ISOlat1
-  {":U", "&Uuml;"},    // ISOlat1
-  {":Y", "&Yuml;"},    // ISOlat2
-  {":a", "&auml;"},    // ISOlat1
-  {":e", "&euml;"},    // ISOlat1
-  {":i", "&iuml;"},    // ISOlat1
-  {":o", "&ouml;"},    // ISOlat1
-  {":u", "&uuml;"},    // ISOlat1
-  {":y", "&yuml;"},    // ISOlat1
-  {"^A", "&Acirc;"},   // ISOlat1
-  {"^E", "&Ecirc;"},   // ISOlat1
-  {"^I", "&Icirc;"},   // ISOlat1
-  {"^O", "&Ocirc;"},   // ISOlat1
-  {"^U", "&Ucirc;"},   // ISOlat1
-  {"^a", "&acirc;"},   // ISOlat1
-  {"^e", "&ecirc;"},   // ISOlat1
-  {"^i", "&icirc;"},   // ISOlat1
-  {"^o", "&ocirc;"},   // ISOlat1
-  {"^u", "&ucirc;"},   // ISOlat1
-  {"`A", "&Agrave;"},  // ISOlat1
-  {"`E", "&Egrave;"},  // ISOlat1
-  {"`I", "&Igrave;"},  // ISOlat1
-  {"`O", "&Ograve;"},  // ISOlat1
-  {"`U", "&Ugrave;"},  // ISOlat1
-  {"`a", "&agrave;"},  // ISOlat1
-  {"`e", "&egrave;"},  // ISOlat1
-  {"`i", "&igrave;"},  // ISOlat1
-  {"`o", "&ograve;"},  // ISOlat1
-  {"`u", "&ugrave;"},  // ISOlat1
-  {"~A", "&Atilde;"},  // ISOlat1
-  {"~N", "&Ntilde;"},  // ISOlat1
-  {"~O", "&Otilde;"},  // ISOlat1
-  {"~a", "&atilde;"},  // ISOlat1
-  {"~n", "&ntilde;"},  // ISOlat1
-  {"~o", "&otilde;"},  // ISOlat1
-  {"vS", "&Scaron;"},  // ISOlat2
-  {"vs", "&scaron;"},  // ISOlat2
-  {"vZ", "&Zcaron;"},  // ISOlat2
-  {"vz", "&zcaron;"},  // ISOlat2
-  {",C", "&Ccedil;"},  // ISOlat1
-  {",c", "&ccedil;"},  // ISOlat1
-  {"/L", "&Lstrok;"},  // ISOlat2: Polish L with a slash
-  {"/l", "&lstrok;"},  // ISOlat2: Polish l with a slash
-  {"/O", "&Oslash;"},  // ISOlat1
-  {"/o", "&oslash;"},  // ISOlat1
-  {"oA", "&Aring;"},   // ISOlat1
-  {"oa", "&aring;"},   // ISOlat1
-  // Accents
-  {"a\"","&dblac;"},   // ISOdia: double acute accent (Hungarian umlaut)
-  {"a-", "&macr;"},    // ISOdia: macron or bar accent
-  {"a.", "&dot;"},     // ISOdia: dot above
-  {"a^", "&circ;"},    // ISOdia: circumflex accent
-  {"aa", "&acute;"},   // ISOdia: acute accent
-  {"ga", "&grave;"},   // ISOdia: grave accent
-  {"ab", "&breve;"},   // ISOdia: breve accent
-  {"ac", "&cedil;"},   // ISOdia: cedilla accent
-  {"ad", "&uml;"},     // ISOdia: umlaut or dieresis
-  {"ah", "&caron;"},   // ISOdia: caron (aka hacek accent)
-  {"ao", "&ring;"},    // ISOdia: ring or circle accent
-  {"a~", "&tilde;"},   // ISOdia: tilde accent
-  {"ho", "&ogon;"},    // ISOdia: hook or ogonek accent
-  {"ha", "^"},         // ASCII circumflex, hat, caret
-  {"ti", "~"},         // ASCII tilde, large tilde
-  // Quotes
-  {"Bq", "&lsquor;"},  // ISOpub: low double comma quote
-  {"bq", "&ldquor;"},  // ISOpub: low single comma quote
-  {"lq", "&ldquo;"},   // ISOnum
-  {"rq", "&rdquo;"},   // ISOpub
-  {"oq", "&lsquo;"},   // ISOnum: single open quote
-  {"cq", "&rsquo;"},   // ISOnum: single closing quote (ASCII 39)
-  {"aq", "&zerosp;'"}, // apostrophe quote
-  {"dq", "\""},                // double quote (ASCII 34)
-  {"Fo", "&laquo;"},   // ISOnum
-  {"Fc", "&raquo;"},   // ISOnum
-  //{"fo", "&fo;"},
-  //{"fc", "&fc;"},
-  // Punctuation
-  {"r!", "&iexcl;"},   // ISOnum
-  {"r?", "&iquest;"},  // ISOnum
-  // Old troff \(em goes here
-  {"en", "&ndash;"},   // ISOpub: en dash
-  // Old troff \(hy goes here 
-  // Brackets
-  {"lB", "&lsqb;"},    // ISOnum: left (square) bracket
-  {"rB", "&rsqb;"},    // ISOnum: right (square) bracket
-  {"lC", "&lcub;"},    // ISOnum: left (curly) brace
-  {"rC", "&rcub;"},    // ISOnum: right (curly) brace
-  {"la", "&lang;"},    // ISOtech: left angle bracket
-  {"ra", "&rang;"},    // ISOtech: right angle bracket
-  // Old troff \(bv goes here
-  // Bracket-pile characters could go here.
-  // Arrows
-  // Old troff \(<- and \(-> go here
-  {"<>", "&harr;"},    // ISOamsa
-  {"da", "&darr;"},    // ISOnum
-  {"ua", "&uarr;"},    // ISOnum
-  {"lA", "&lArr;"},    // ISOtech
-  {"rA", "&rArr;"},    // ISOtech
-  {"hA", "&iff;"},     // ISOtech: horizontal double-headed arrow
-  {"dA", "&dArr;"},    // ISOamsa
-  {"uA", "&uArr;"},    // ISOamsa
-  {"vA", "&vArr;"},    // ISOamsa: vertical double-headed double arrow
-  //{"an", "&an;"},
-  // Lines
-  {"-h", "&planck;"},  // ISOamso: h-bar (Planck's constant)
-  // Old troff \(or goes here
-  {"ba", "&verbar;"},  // ISOnum
-  // Old troff \(br, \{u, \(ul, \(bv go here
-  {"bb", "&brvbar;"},  // ISOnum
-  {"sl", "/"},
-  {"rs", "&bsol;"},    // ISOnum
-  // Text markers
-  // Old troff \(ci, \(bu, \(dd, \(dg go here
-  {"lz", "&loz;"},     // ISOpub
-  // Old troff sq goes here
-  {"ps", "&para;"},    // ISOnum: paragraph or pilcrow sign
-  {"sc", "&sect;"},    // ISOnum (in old troff)
-  // Old troff \(lh, \{h go here
-  {"at", "&commat;"},  // ISOnum
-  {"sh", "&num;"},     // ISOnum
-  //{"CR", "&CR;"},
-  {"OK", "&check;"},   // ISOpub
-  // Legalize
-  // Old troff \(co, \{g go here
-  {"tm", "&trade;"},   // ISOnum
-  // Currency symbols
-  {"Do", "&dollar;"},  // ISOnum
-  {"ct", "&cent;"},    // ISOnum
-  {"eu", "&euro;"},
-  {"Eu", "&euro;"},
-  {"Ye", "&yen;"},     // ISOnum
-  {"Po", "&pound;"},   // ISOnum
-  {"Cs", "&curren;"},  // ISOnum: currency sign
-  {"Fn", "&fnof"},     // ISOtech
-  // Units
-  // Old troff de goes here
-  {"%0", "&permil;"},  // ISOtech: per thousand, per mille sign
-  // Old troff \(fm goes here
-  {"sd", "&Prime;"},   // ISOtech
-  {"mc", "&micro;"},   // ISOnum
-  {"Of", "&ordf;"},    // ISOnum
-  {"Om", "&ordm;"},    // ISOnum
-  // Logical symbols
-  {"AN", "&and;"},     // ISOtech
-  {"OR", "&or;"},      // ISOtech
-  // Old troff \(no goes here
-  {"te", "&exist;"},   // ISOtech: there exists, existential quantifier
-  {"fa", "&forall;"},  // ISOtech: for all, universal quantifier
-  {"st", "&bepsi"},    // ISOamsr: such that
-  {"3d", "&there4;"},  // ISOtech
-  {"tf", "&there4;"},  // ISOtech
-  // Mathematical symbols
-  // Old troff "12", "14", "34" goes here
-  {"S1", "&sup1;"},    // ISOnum
-  {"S2", "&sup2;"},    // ISOnum
-  {"S3", "&sup3;"},    // ISOnum
-  // Old troff \(pl", \-, \(+- go here
-  {"t+-", "&plusmn;"}, // ISOnum
-  {"-+", "&mnplus;"},  // ISOtech
-  {"pc", "&middot;"},  // ISOnum
-  {"md", "&middot;"},  // ISOnum
-  // Old troff \(mu goes here
-  {"tmu", "&times;"},  // ISOnum
-  {"c*", "&otimes;"},  // ISOamsb: multiply sign in a circle
-  {"c+", "&oplus;"},   // ISOamsb: plus sign in a circle
-  // Old troff \(di goes here
-  {"tdi", "&divide;"}, // ISOnum
-  {"f/", "&horbar;"},  // ISOnum: horizontal bar for fractions
-  // Old troff \(** goes here
-  {"<=", "&le;"},      // ISOtech
-  {">=", "&ge;"},      // ISOtech
-  {"<<", "&Lt;"},      // ISOamsr
-  {">>", "&Gt;"},      // ISOamsr
-  {"!=", "&ne;"},      // ISOtech
-  // Old troff \(eq and \(== go here
-  {"=~", "&cong;"},    // ISOamsr
-  // Old troff \(ap goes here
-  {"~~", "&ap;"},      // ISOtech
-  // This appears to be an error in the groff table.  
-  // It clashes with the Bell Labs use of ~= for a congruence sign
-  // {"~=", "&ap;"},   // ISOamsr
-  // Old troff \(pt, \(es, \(mo go here
-  {"nm", "&notin;"},   // ISOtech
-  {"nb", "&nsub;"},    // ISOamsr
-  {"nc", "&nsup;"},    // ISOamsn
-  {"ne", "&nequiv;"},  // ISOamsn
-  // Old troff \(sb, \(sp, \(ib, \(ip, \(ca, \(cu go here
-  {"/_", "&ang;"},     // ISOamso
-  {"pp", "&perp;"},    // ISOtech
-  // Old troff \(is goes here
-  {"sum", "&sum;"},    // ISOamsb
-  {"product", "&prod;"},       // ISOamsb
-  {"gr", "&nabla;"},   // ISOtech
-  // Old troff \(sr. \{n, \(if go here
-  {"Ah", "&aleph;"},   // ISOtech
-  {"Im", "&image;"},   // ISOamso: Fraktur I, imaginary
-  {"Re", "&real;"},    // ISOamso: Fraktur R, real
-  {"wp", "&weierp;"},  // ISOamso
-  {"pd", "&part;"},    // ISOtech: partial differentiation sign
-  // Their table duplicates the Greek letters here.
-  // We list only the variant forms here, mapping them into
-  // the ISO Greek 4 variants (which may or may not be correct :-() 
-  {"+f", "&b.phiv;"},  // ISOgrk4: variant phi
-  {"+h", "&b.thetas;"},        // ISOgrk4: variant theta
-  {"+p", "&b.omega;"}, // ISOgrk4: variant pi, looking like omega
-  // Card symbols
-  {"CL", "&clubs;"},   // ISOpub: club suit
-  {"SP", "&spades;"},  // ISOpub: spade suit
-  {"HE", "&hearts;"},  // ISOpub: heart suit
-  {"DI", "&diams;"},   // ISOpub: diamond suit
-};
-
-const char *special_to_entity(const char *sp)
-{
-  struct map *mp;
-  for (mp = entity_table; mp < entity_table + countof(entity_table); mp++) {
-    if (strcmp(mp->from, sp) == 0)
-      return mp->to;
-  }
-  return NULL;
-}
+#include "unicode.h"
 
 class char_box : public simple_box {
   unsigned char c;
@@ -665,9 +289,11 @@ void special_char_box::output()
       printf("\\fP");
   }
   else if (output_format == mathml) {
-    const char *entity = special_to_entity(s);
-    if (entity != NULL)
-      printf("<mo>%s</mo>", entity);
+    const char *unicode_code_point = valid_unicode_code_sequence(s);
+    if (unicode_code_point == NULL)
+      unicode_code_point = glyph_name_to_unicode(s);
+    if (unicode_code_point != NULL)
+      printf("<mo>&#x%s;</mo>", unicode_code_point);
     else
       printf("<merror>unknown eqn/troff special char %s</merror>", s);
   }
-- 
2.52.0

[PATCH v2] [eqn]: Use XML character references in MathML output.

Reply via email to