Hello,
here is a revision of the earlier patch to allow wv to output
iso-8859-1 characters. A big "thank you!" to Joaquim Cuenca
Abela for pointing out the problem and forcing me to read some
background documentation ;-)
I believe the current patch is correct, almost complete
and "politically correct". And it shrinks the code ;-)
1. The LaTeX output now uses the latin1 (iso-8859-1) encoding,
which is the best choice until a Unicode/UTF-8 enabled
LaTeX comes along. Standard iso-8859-1 ("Latin-1") 8-bit
codes for chars 159-255 are supported; they are simply
passed through unchanged, as these codes are already in the
char16 variable. So escape sequences like \"a have vanished.
(I assume this arrangement also makes it easier to implement
support for other Latin-x encodings.)
2. A number of Windows extended chars often found in
Windows documents were added by hand ("Latin1-Win"),
also on the Html side. A few had already been done by
Caolan.
3. The HTML output now consistently uses Unicode/UTF-8.
The block that output German characters like a-umlaut as
&auml; has been removed, as such entities are not necessary if
the doc is marked as being UTF-8 encoded.
I'm not sure about &szlig;, so I left it there.
Please let me know if this is the way to go. The patch is
against the current CVS.
Martin
--
Martin Vermeer [EMAIL PROTECTED] [EMAIL PROTECTED]
-- Linux means: never having to delete your love letters
Index: text.c
===================================================================
RCS file: /cvsroot/wv/text.c,v
retrieving revision 1.38
diff -u -r1.38 text.c
--- text.c 2000/07/01 15:48:01 1.38
+++ text.c 2000/07/04 08:42:37
@@ -557,13 +557,13 @@
printf("\\%%");
return(1);
case 11:
- printf("newline\n");
+ printf("\\\\\n");
return(1);
case 30:
case 31:
case 45:
case 0x2013:
- printf("-");
+ printf("--"); /* en-dash */
return(1);
case 12:
case 13:
@@ -584,220 +584,81 @@
return(1);
/*
- german and scandinavian characters, MV 1.7.2000
- See man iso_8859_1
- */
- case 0xc4:
- printf("\\\"A");
- return(1);
- case 0xe4:
- printf("\\\"a");
- return(1);
- case 0xdc:
- printf("\\\"U");
- return(1);
- case 0xfc:
- printf("\\\"u");
- return(1);
- case 0xd6:
- printf("\\\"O");
- return(1);
- case 0xf6:
- printf("\\\"o");
- return(1);
- case 0xdf:
- printf("\\ss{}"); /* German ss */
- return(1);
- case 0xc5:
- printf("\\AA{}");
- return(1);
- case 0xe5:
- printf("\\aa{}");
- return(1);
- case 0xc6:
- printf("\\AE{}");
- return(1);
- case 0xe6:
- printf("\\ae{}");
- return(1);
- case 0xd8:
- printf("\\O{}"); /* Danish O-slash */
- return(1);
- case 0xf8:
- printf("\\o{}");
- return(1);
+ german and scandinavian characters, MV 1.7.2000
+ See man iso_8859_1
- /* Some more iso-8859-1, not properly tested */
+ This requires the inputencoding latin1 package,
+ see latin1.def. Chars in range 159...255 are just
+ put through as these are legal iso-8859-1 symbols.
+ -- MV 4.7.2000
+ */
- case 0xb1:
- printf("\\pm{}"); /* plusminus */
- return(1);
- case 0xab:
- printf("\\flqq{}"); /* french quotes << */
- return(1);
- case 0xbb:
- printf("\\frqq{}"); /* french quotes >> */
- return(1);
- case 0xa1:
- printf("!`"); /* Spanish ! */
- return(1);
- case 0xbf:
- printf("?`"); /* Spanish ? */
- return(1);
- case 0xc0:
- printf("\\`A");
- return(1);
- case 0xc1:
- printf("\\'A");
- return(1);
- case 0xc2:
- printf("\\^A");
+ case 0x9f ... 0xff:
+ printf("%c", char16);
return(1);
- case 0xc3:
- printf("\\~A");
- return(1);
- case 0xc7:
- printf("\\c C"); /* C cedilla */
- return(1);
- case 0xc8:
- printf("\\`E");
- return(1);
- case 0xc9:
- printf("\\'E");
- return(1);
- case 0xca:
- printf("\\^E");
- return(1);
- case 0xcb:
- printf("\\\"E");
- return(1);
- case 0xcc:
- printf("\\`I");
- return(1);
- case 0xce:
- printf("\\^I");
- return(1);
- case 0xcf:
- printf("\\\"I");
- return(1);
- case 0xd1:
- printf("\\~N");
- return(1);
- case 0xd2:
- printf("\\`O");
- return(1);
- case 0xd3:
- printf("\\'O");
- return(1);
- case 0xd4:
- printf("\\^O");
- return(1);
- case 0xd5:
- printf("\\~O");
- return(1);
- case 0xd9:
- printf("\\`U");
- return(1);
- case 0xda:
- printf("\\'U");
- return(1);
- case 0xdb:
- printf("\\^U");
- return(1);
- case 0xdd:
- printf("\\'Y");
- return(1);
- case 0xe0:
- printf("\\`a");
- return(1);
- case 0xe1:
- printf("\\'a");
- return(1);
- case 0xe2:
- printf("\\^a");
- return(1);
- case 0xe3:
- printf("\\~a");
- return(1);
- case 0xe7:
- printf("\\c c");
- return(1);
- case 0xe8:
- printf("\\`e");
- return(1);
- case 0xe9:
- printf("\\'e");
- return(1);
- case 0xea:
- printf("\\^e");
+ case 0x2019:
+ printf("'"); /* Right single quote, Win */
return(1);
- case 0xeb:
- printf("\\\"e");
+ case 0x2215:
+ printf("/");
return(1);
- case 0xec:
- printf("\\`i");
+ case 0xF8E7:
+ /* without this, things should work in theory, but not for me */
+ printf("_");
return(1);
- case 0xed:
- printf("\\'i");
+ case 0x2018:
+ printf("`"); /* left single quote, Win */
return(1);
- case 0xee:
- printf("\\^i");
+
+ /* Windows specials (MV 4.7.2000). More could be added.
+ See http://www.hut.fi/u/jkorpela/www/windows-chars.html
+ */
+
+ case 0x0160:
+ printf("\\v S"); /* S-caron */
return(1);
- case 0xef:
- printf("\\\"i");
+ case 0x0161:
+ printf("\\v s"); /* s-caron */
return(1);
- case 0xf1:
- printf("\\~n");
+ case 0x2014:
+ printf("---"); /* em-dash */
return(1);
- case 0xf2:
- printf("\\`o");
+ case 0x201c:
+ printf("``"); /* inverted double quotation mark */
return(1);
- case 0xf3:
- printf("\\'o");
+ case 0x201d:
+ printf("''"); /* double q.m. */
return(1);
- case 0xf4:
- printf("\\^o");
+ case 0x2020:
+ printf("\\dagger");
return(1);
- case 0xf5:
- printf("\\~o");
+ case 0x2021:
+ printf("\\ddagger");
return(1);
- case 0xf9:
- printf("\\`u");
+ case 0x2022:
+ printf("\\bullet");
return(1);
- case 0xfa:
- printf("\\'u");
+ case 0x0152:
+ printf("\\OE{}"); /* OE ligature */
return(1);
- case 0xfb:
- printf("\\^u");
+ case 0x0153:
+ printf("\\oe{}"); /* oe ligature */
return(1);
- case 0xfd:
- printf("\\'y");
+ case 0x0178:
+ printf("\\\"Y;");
return(1);
- case 0xff:
- printf("\\\"y");
+ case 0x2030:
+ printf("o/oo");
return(1);
-
- case 0xf0:
- printf("?"); /* Icelandic eth? */
+ case 0x20ac:
+ printf("\\euro"); /* No known implementation ;-) */
return(1);
- /* End iso-8859-1 */
-
- case 0x2019:
- printf("'");
- return(1);
- case 0x2215:
- printf("/");
- return(1);
- case 0xF8E7: /* without this, things should work in theory, but not
for me */
- printf("_");
- return(1);
- case 0x2018:
- printf("`");
- return(1);
}
+ /* Debugging aid: */
+ if (char16 >= 0x80) printf("[%x]", char16);
return(0);
}
@@ -812,7 +673,7 @@
case 31:
case 45:
case 0x2013:
- printf("-");
+ printf("--"); /* en-dash */
return(1);
case 12:
case 13:
@@ -831,31 +692,18 @@
case 62:
printf(">");
return(1);
- /*
+ /*
german characters, im assured that this is the right way to handle them
by Markus Schulte <[EMAIL PROTECTED]>
- */
- case 0xc4:
- printf("Ä");
- return(1);
- case 0xe4:
- printf("ä");
- return(1);
- case 0xdc:
- printf("Ü");
- return(1);
- case 0xfc:
- printf("ü");
- return(1);
- case 0xd6:
- printf("Ö");
- return(1);
- case 0xf6:
- printf("ö");
- return(1);
- case 0xdf:
- printf("ß");
- return(1);
+
+ As the output encoding for HTML was chosen as UTF-8,
+ we don't need Ä etc. etc. I removed all but sz
+ -- MV 6.4.2000
+ */
+
+ case 0xdf:
+ printf("ß");
+ return(1);
/* end german characters */
case 0x2026:
#if 0
@@ -863,7 +711,8 @@
this just looks awful in netscape 4.5, so im going to do a very foolish
thing and just put ... instead of this
*/
- printf("…"); /*is there a proper html name for ...
&ellipse; ?*/
+ printf("…");
+/*is there a proper html name for ... &ellipse;? Yes, … -- MV */
#endif
printf("...");
return(1);
@@ -879,6 +728,48 @@
case 0x2018:
printf("`");
return(1);
+
+ /* Windows specials (MV): */
+ case 0x0160:
+ printf("Š");
+ return(1);
+ case 0x0161:
+ printf("š");
+ return(1);
+ case 0x2014:
+ printf("—");
+ return(1);
+ case 0x201c:
+ printf("“"); /* inverted double quotation mark */
+ return(1);
+ case 0x201d:
+ printf("”"); /* double q.m. */
+ return(1);
+ case 0x2020:
+ printf("†");
+ return(1);
+ case 0x2021:
+ printf("‡");
+ return(1);
+ case 0x2022:
+ printf("•");
+ return(1);
+ case 0x0152:
+ printf("Œ");
+ return(1);
+ case 0x0153:
+ printf("œ");
+ return(1);
+ case 0x0178:
+ printf("Ÿ");
+ return(1);
+ case 0x2030:
+ printf("‰");
+ return(1);
+ case 0x20ac:
+ printf("€");
+ return(1);
+
}
return(0);
}