This patch is mainly a large extension to the approximate() method.
It's certainly not complete yet, though.
There are also a couple of minor changes and a few comments that
should be looked at.
Andrew Dunbar.
--
http://linguaphile.sourceforge.net
Index: src/af/xap/xp/xap_EncodingManager.cpp
===================================================================
RCS file: /cvsroot/abi/src/af/xap/xp/xap_EncodingManager.cpp,v
retrieving revision 1.33
diff -u -r1.33 xap_EncodingManager.cpp
--- src/af/xap/xp/xap_EncodingManager.cpp 2001/05/25 05:52:12 1.33
+++ src/af/xap/xp/xap_EncodingManager.cpp 2001/06/03 07:18:20
@@ -81,6 +81,8 @@
char XAP_EncodingManager::fallbackChar(UT_UCSChar c) const
{
+ // Always returns '?', whatever the value of c.
+ // TODO: return U+FFFD "REPLACEMENT CHARACTER" or U+25A0 "BLACK SQUARE" for Unicode?
return '?';
}
@@ -90,12 +92,878 @@
{
if (max_length==0)
return 0;
- if (max_length==1)
+ if (max_length>=3)
{
switch (c)
{
- case 0x201d:
- case 0x201c:
+ case 0x00A9: // COPYRIGHT SIGN
+ case 0x24B8: // CIRCLED LATIN CAPITAL LETTER C
+ strcpy(out,"(C)"); return 3;
+ case 0x00AE: // REGISTERED SIGN
+ case 0x24C7: // CIRCLED LATIN CAPITAL LETTER R
+ strcpy(out,"(R)"); return 3;
+ case 0xFB03: // LATIN SMALL LIGATURE FFI
+ strcpy(out,"ffi"); return 3;
+ case 0xFB04: // LATIN SMALL LIGATURE FFL
+ strcpy(out,"ffl"); return 3;
+ }
+ }
+ if (max_length>=2)
+ {
+ switch (c)
+ {
+ case 0x00C6: // LATIN CAPITAL LETTER AE
+ case 0x01E2: // LATIN CAPITAL LETTER AE WITH MACRON
+ case 0x01FC: // LATIN CAPITAL LETTER AE WITH ACUTE
+ strcpy(out,"AE"); return 2;
+ case 0x00DF: // LATIN SMALL LETTER SHARP S
+ strcpy(out,"ss"); return 2;
+ case 0x00E6: // LATIN SMALL LETTER AE
+ case 0x01E3: // LATIN SMALL LETTER AE WITH MACRON
+ case 0x01FD: // LATIN SMALL LETTER AE WITH ACUTE
+ strcpy(out,"ae"); return 2;
+ case 0x0132: // LATIN CAPITAL LIGATURE IJ
+ strcpy(out,"IJ"); return 2;
+ case 0x0133: // LATIN SMALL LIGATURE IJ
+ strcpy(out,"ij"); return 2;
+ case 0x0152: // LATIN CAPITAL LIGATURE OE
+ strcpy(out,"OE"); return 2;
+ case 0x0153: // LATIN SMALL LIGATURE OE
+ strcpy(out,"oe"); return 2;
+ case 0xFB00: // LATIN SMALL LIGATURE FF
+ strcpy(out,"ff"); return 2;
+ case 0xFB01: // LATIN SMALL LIGATURE FI
+ strcpy(out,"fi"); return 2;
+ case 0xFB02: // LATIN SMALL LIGATURE FL
+ strcpy(out,"fl"); return 2;
+ case 0xFB05: // LATIN SMALL LIGATURE LONG S T
+ case 0xFB06: // LATIN SMALL LIGATURE ST
+ strcpy(out,"st"); return 2;
+ }
+ }
+ if (max_length>=1)
+ {
+ switch (c)
+ {
+ case 0x00C0: // LATIN CAPITAL LETTER A WITH GRAVE
+ case 0x00C1: // LATIN CAPITAL LETTER A WITH ACUTE
+ case 0x00C2: // LATIN CAPITAL LETTER A WITH CIRCUMFLEX
+ case 0x00C3: // LATIN CAPITAL LETTER A WITH TILDE
+ case 0x00C4: // LATIN CAPITAL LETTER A WITH DIAERESIS
+ case 0x00C5: // LATIN CAPITAL LETTER A WITH RING ABOVE
+ case 0x0100: // LATIN CAPITAL LETTER A WITH MACRON
+ case 0x0102: // LATIN CAPITAL LETTER A WITH BREVE
+ case 0x0104: // LATIN CAPITAL LETTER A WITH OGONEK
+ case 0x01CD: // LATIN CAPITAL LETTER A WITH CARON
+ case 0x01DE: // LATIN CAPITAL LETTER A WITH DIAERESIS AND MACRON
+ case 0x01E0: // LATIN CAPITAL LETTER A WITH DOT ABOVE AND MACRON
+ case 0x01FA: // LATIN CAPITAL LETTER A WITH RING ABOVE AND ACUTE
+ case 0x0200: // LATIN CAPITAL LETTER A WITH DOUBLE GRAVE
+ case 0x0202: // LATIN CAPITAL LETTER A WITH INVERTED BREVE
+ case 0x1E00: // LATIN CAPITAL LETTER A WITH RING BELOW
+ case 0x1EA0: // LATIN CAPITAL LETTER A WITH DOT BELOW
+ case 0x1EA2: // LATIN CAPITAL LETTER A WITH HOOK ABOVE
+ case 0x1EA4: // LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND ACUTE
+ case 0x1EA6: // LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND GRAVE
+ case 0x1EA8: // LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND HOOK ABOVE
+ case 0x1EAA: // LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND TILDE
+ case 0x1EAC: // LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND DOT BELOW
+ case 0x1EAE: // LATIN CAPITAL LETTER A WITH BREVE AND ACUTE
+ case 0x1EB0: // LATIN CAPITAL LETTER A WITH BREVE AND GRAVE
+ case 0x1EB2: // LATIN CAPITAL LETTER A WITH BREVE AND HOOK ABOVE
+ case 0x1EB4: // LATIN CAPITAL LETTER A WITH BREVE AND TILDE
+ case 0x1EB6: // LATIN CAPITAL LETTER A WITH BREVE AND DOT BELOW
+ case 0xFF21: // FULLWIDTH LATIN CAPITAL LETTER A
+ *out = 'A'; return 1;
+ case 0x0181: // LATIN CAPITAL LETTER B WITH HOOK
+ case 0x0182: // LATIN CAPITAL LETTER B WITH TOPBAR
+ case 0x1E02: // LATIN CAPITAL LETTER B WITH DOT ABOVE
+ case 0x1E04: // LATIN CAPITAL LETTER B WITH DOT BELOW
+ case 0x1E06: // LATIN CAPITAL LETTER B WITH LINE BELOW
+ case 0xFF22: // FULLWIDTH LATIN CAPITAL LETTER B
+ *out = 'B'; return 1;
+ case 0x00C7: // LATIN CAPITAL LETTER C WITH CEDILLA
+ case 0x0106: // LATIN CAPITAL LETTER C WITH ACUTE
+ case 0x0108: // LATIN CAPITAL LETTER C WITH CIRCUMFLEX
+ case 0x010A: // LATIN CAPITAL LETTER C WITH DOT ABOVE
+ case 0x010C: // LATIN CAPITAL LETTER C WITH CARON
+ case 0x0187: // LATIN CAPITAL LETTER C WITH HOOK
+ case 0x1E08: // LATIN CAPITAL LETTER C WITH CEDILLA AND ACUTE
+ case 0xFF23: // FULLWIDTH LATIN CAPITAL LETTER C
+ *out = 'C'; return 1;
+ case 0x010E: // LATIN CAPITAL LETTER D WITH CARON
+ case 0x0110: // LATIN CAPITAL LETTER D WITH STROKE
+ case 0x018A: // LATIN CAPITAL LETTER D WITH HOOK
+ case 0x018B: // LATIN CAPITAL LETTER D WITH TOPBAR
+ case 0x1E0A: // LATIN CAPITAL LETTER D WITH DOT ABOVE
+ case 0x1E0C: // LATIN CAPITAL LETTER D WITH DOT BELOW
+ case 0x1E0E: // LATIN CAPITAL LETTER D WITH LINE BELOW
+ case 0x1E10: // LATIN CAPITAL LETTER D WITH CEDILLA
+ case 0x1E12: // LATIN CAPITAL LETTER D WITH CIRCUMFLEX BELOW
+ case 0xFF24: // FULLWIDTH LATIN CAPITAL LETTER D
+ *out = 'D'; return 1;
+ case 0x00C8: // LATIN CAPITAL LETTER E WITH GRAVE
+ case 0x00C9: // LATIN CAPITAL LETTER E WITH ACUTE
+ case 0x00CA: // LATIN CAPITAL LETTER E WITH CIRCUMFLEX
+ case 0x00CB: // LATIN CAPITAL LETTER E WITH DIAERESIS
+ case 0x0112: // LATIN CAPITAL LETTER E WITH MACRON
+ case 0x0114: // LATIN CAPITAL LETTER E WITH BREVE
+ case 0x0116: // LATIN CAPITAL LETTER E WITH DOT ABOVE
+ case 0x0118: // LATIN CAPITAL LETTER E WITH OGONEK
+ case 0x011A: // LATIN CAPITAL LETTER E WITH CARON
+ case 0x0204: // LATIN CAPITAL LETTER E WITH DOUBLE GRAVE
+ case 0x0206: // LATIN CAPITAL LETTER E WITH INVERTED BREVE
+ case 0x1E14: // LATIN CAPITAL LETTER E WITH MACRON AND GRAVE
+ case 0x1E16: // LATIN CAPITAL LETTER E WITH MACRON AND ACUTE
+ case 0x1E18: // LATIN CAPITAL LETTER E WITH CIRCUMFLEX BELOW
+ case 0x1E1A: // LATIN CAPITAL LETTER E WITH TILDE BELOW
+ case 0x1E1C: // LATIN CAPITAL LETTER E WITH CEDILLA AND BREVE
+ case 0x1EB8: // LATIN CAPITAL LETTER E WITH DOT BELOW
+ case 0x1EBA: // LATIN CAPITAL LETTER E WITH HOOK ABOVE
+ case 0x1EBC: // LATIN CAPITAL LETTER E WITH TILDE
+ case 0x1EBE: // LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND ACUTE
+ case 0x1EC0: // LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND GRAVE
+ case 0x1EC2: // LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND HOOK ABOVE
+ case 0x1EC4: // LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND TILDE
+ case 0x1EC6: // LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND DOT BELOW
+ case 0xFF25: // FULLWIDTH LATIN CAPITAL LETTER E
+ *out = 'E'; return 1;
+ case 0x0191: // LATIN CAPITAL LETTER F WITH HOOK
+ case 0x1E1E: // LATIN CAPITAL LETTER F WITH DOT ABOVE
+ case 0xFF26: // FULLWIDTH LATIN CAPITAL LETTER F
+ *out = 'F'; return 1;
+ case 0x011C: // LATIN CAPITAL LETTER G WITH CIRCUMFLEX
+ case 0x011E: // LATIN CAPITAL LETTER G WITH BREVE
+ case 0x0120: // LATIN CAPITAL LETTER G WITH DOT ABOVE
+ case 0x0122: // LATIN CAPITAL LETTER G WITH CEDILLA
+ case 0x0193: // LATIN CAPITAL LETTER G WITH HOOK
+ case 0x01E4: // LATIN CAPITAL LETTER G WITH STROKE
+ case 0x01E6: // LATIN CAPITAL LETTER G WITH CARON
+ case 0x01F4: // LATIN CAPITAL LETTER G WITH ACUTE
+ case 0x1E20: // LATIN CAPITAL LETTER G WITH MACRON
+ case 0xFF27: // FULLWIDTH LATIN CAPITAL LETTER G
+ *out = 'G'; return 1;
+ case 0x0124: // LATIN CAPITAL LETTER H WITH CIRCUMFLEX
+ case 0x0126: // LATIN CAPITAL LETTER H WITH STROKE
+ case 0x1E22: // LATIN CAPITAL LETTER H WITH DOT ABOVE
+ case 0x1E24: // LATIN CAPITAL LETTER H WITH DOT BELOW
+ case 0x1E26: // LATIN CAPITAL LETTER H WITH DIAERESIS
+ case 0x1E28: // LATIN CAPITAL LETTER H WITH CEDILLA
+ case 0x1E2A: // LATIN CAPITAL LETTER H WITH BREVE BELOW
+ case 0xFF28: // FULLWIDTH LATIN CAPITAL LETTER H
+ *out = 'H'; return 1;
+ case 0x00CC: // LATIN CAPITAL LETTER I WITH GRAVE
+ case 0x00CD: // LATIN CAPITAL LETTER I WITH ACUTE
+ case 0x00CE: // LATIN CAPITAL LETTER I WITH CIRCUMFLEX
+ case 0x00CF: // LATIN CAPITAL LETTER I WITH DIAERESIS
+ case 0x0128: // LATIN CAPITAL LETTER I WITH TILDE
+ case 0x012A: // LATIN CAPITAL LETTER I WITH MACRON
+ case 0x012C: // LATIN CAPITAL LETTER I WITH BREVE
+ case 0x012E: // LATIN CAPITAL LETTER I WITH OGONEK
+ case 0x0130: // LATIN CAPITAL LETTER I WITH DOT ABOVE
+ case 0x0197: // LATIN CAPITAL LETTER I WITH STROKE
+ case 0x01CF: // LATIN CAPITAL LETTER I WITH CARON
+ case 0x0208: // LATIN CAPITAL LETTER I WITH DOUBLE GRAVE
+ case 0x020A: // LATIN CAPITAL LETTER I WITH INVERTED BREVE
+ case 0x1E2C: // LATIN CAPITAL LETTER I WITH TILDE BELOW
+ case 0x1E2E: // LATIN CAPITAL LETTER I WITH DIAERESIS AND ACUTE
+ case 0x1EC8: // LATIN CAPITAL LETTER I WITH HOOK ABOVE
+ case 0x1ECA: // LATIN CAPITAL LETTER I WITH DOT BELOW
+ case 0xFF29: // FULLWIDTH LATIN CAPITAL LETTER I
+ *out = 'I'; return 1;
+ case 0x0134: // LATIN CAPITAL LETTER J WITH CIRCUMFLEX
+ case 0xFF2A: // FULLWIDTH LATIN CAPITAL LETTER J
+ *out = 'J'; return 1;
+ case 0x0136: // LATIN CAPITAL LETTER K WITH CEDILLA
+ case 0x0198: // LATIN CAPITAL LETTER K WITH HOOK
+ case 0x01E8: // LATIN CAPITAL LETTER K WITH CARON
+ case 0x1E30: // LATIN CAPITAL LETTER K WITH ACUTE
+ case 0x1E32: // LATIN CAPITAL LETTER K WITH DOT BELOW
+ case 0x1E34: // LATIN CAPITAL LETTER K WITH LINE BELOW
+ case 0xFF2B: // FULLWIDTH LATIN CAPITAL LETTER K
+ *out = 'K'; return 1;
+ case 0x0139: // LATIN CAPITAL LETTER L WITH ACUTE
+ case 0x013B: // LATIN CAPITAL LETTER L WITH CEDILLA
+ case 0x013D: // LATIN CAPITAL LETTER L WITH CARON
+ case 0x013F: // LATIN CAPITAL LETTER L WITH MIDDLE DOT
+ case 0x0141: // LATIN CAPITAL LETTER L WITH STROKE
+ case 0x1E36: // LATIN CAPITAL LETTER L WITH DOT BELOW
+ case 0x1E38: // LATIN CAPITAL LETTER L WITH DOT BELOW AND MACRON
+ case 0x1E3A: // LATIN CAPITAL LETTER L WITH LINE BELOW
+ case 0x1E3C: // LATIN CAPITAL LETTER L WITH CIRCUMFLEX BELOW
+ case 0xFF2C: // FULLWIDTH LATIN CAPITAL LETTER L
+ *out = 'L'; return 1;
+ case 0x1E3E: // LATIN CAPITAL LETTER M WITH ACUTE
+ case 0x1E40: // LATIN CAPITAL LETTER M WITH DOT ABOVE
+ case 0x1E42: // LATIN CAPITAL LETTER M WITH DOT BELOW
+ case 0xFF2D: // FULLWIDTH LATIN CAPITAL LETTER M
+ *out = 'M'; return 1;
+ case 0x00D1: // LATIN CAPITAL LETTER N WITH TILDE
+ case 0x0143: // LATIN CAPITAL LETTER N WITH ACUTE
+ case 0x0145: // LATIN CAPITAL LETTER N WITH CEDILLA
+ case 0x0147: // LATIN CAPITAL LETTER N WITH CARON
+ case 0x019D: // LATIN CAPITAL LETTER N WITH LEFT HOOK
+ case 0x1E44: // LATIN CAPITAL LETTER N WITH DOT ABOVE
+ case 0x1E46: // LATIN CAPITAL LETTER N WITH DOT BELOW
+ case 0x1E48: // LATIN CAPITAL LETTER N WITH LINE BELOW
+ case 0x1E4A: // LATIN CAPITAL LETTER N WITH CIRCUMFLEX BELOW
+ case 0xFF2E: // FULLWIDTH LATIN CAPITAL LETTER N
+ *out = 'N'; return 1;
+ case 0x00D2: // LATIN CAPITAL LETTER O WITH GRAVE
+ case 0x00D3: // LATIN CAPITAL LETTER O WITH ACUTE
+ case 0x00D4: // LATIN CAPITAL LETTER O WITH CIRCUMFLEX
+ case 0x00D5: // LATIN CAPITAL LETTER O WITH TILDE
+ case 0x00D6: // LATIN CAPITAL LETTER O WITH DIAERESIS
+ case 0x00D8: // LATIN CAPITAL LETTER O WITH STROKE
+ case 0x014C: // LATIN CAPITAL LETTER O WITH MACRON
+ case 0x014E: // LATIN CAPITAL LETTER O WITH BREVE
+ case 0x0150: // LATIN CAPITAL LETTER O WITH DOUBLE ACUTE
+ case 0x019F: // LATIN CAPITAL LETTER O WITH MIDDLE TILDE
+ case 0x01A0: // LATIN CAPITAL LETTER O WITH HORN
+ case 0x01D1: // LATIN CAPITAL LETTER O WITH CARON
+ case 0x01EA: // LATIN CAPITAL LETTER O WITH OGONEK
+ case 0x01EC: // LATIN CAPITAL LETTER O WITH OGONEK AND MACRON
+ case 0x01FE: // LATIN CAPITAL LETTER O WITH STROKE AND ACUTE
+ case 0x020C: // LATIN CAPITAL LETTER O WITH DOUBLE GRAVE
+ case 0x020E: // LATIN CAPITAL LETTER O WITH INVERTED BREVE
+ case 0x1E4C: // LATIN CAPITAL LETTER O WITH TILDE AND ACUTE
+ case 0x1E4E: // LATIN CAPITAL LETTER O WITH TILDE AND DIAERESIS
+ case 0x1E50: // LATIN CAPITAL LETTER O WITH MACRON AND GRAVE
+ case 0x1E52: // LATIN CAPITAL LETTER O WITH MACRON AND ACUTE
+ case 0x1ECC: // LATIN CAPITAL LETTER O WITH DOT BELOW
+ case 0x1ECE: // LATIN CAPITAL LETTER O WITH HOOK ABOVE
+ case 0x1ED0: // LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND ACUTE
+ case 0x1ED2: // LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND GRAVE
+ case 0x1ED4: // LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND HOOK ABOVE
+ case 0x1ED6: // LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND TILDE
+ case 0x1ED8: // LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND DOT BELOW
+ case 0x1EDA: // LATIN CAPITAL LETTER O WITH HORN AND ACUTE
+ case 0x1EDC: // LATIN CAPITAL LETTER O WITH HORN AND GRAVE
+ case 0x1EDE: // LATIN CAPITAL LETTER O WITH HORN AND HOOK ABOVE
+ case 0x1EE0: // LATIN CAPITAL LETTER O WITH HORN AND TILDE
+ case 0x1EE2: // LATIN CAPITAL LETTER O WITH HORN AND DOT BELOW
+ case 0xFF2F: // FULLWIDTH LATIN CAPITAL LETTER O
+ *out = 'O'; return 1;
+ case 0x01A4: // LATIN CAPITAL LETTER P WITH HOOK
+ case 0x1E54: // LATIN CAPITAL LETTER P WITH ACUTE
+ case 0x1E56: // LATIN CAPITAL LETTER P WITH DOT ABOVE
+ case 0xFF30: // FULLWIDTH LATIN CAPITAL LETTER P
+ *out = 'P'; return 1;
+ case 0xFF31: // FULLWIDTH LATIN CAPITAL LETTER Q
+ *out = 'Q'; return 1;
+ case 0x0154: // LATIN CAPITAL LETTER R WITH ACUTE
+ case 0x0156: // LATIN CAPITAL LETTER R WITH CEDILLA
+ case 0x0158: // LATIN CAPITAL LETTER R WITH CARON
+ case 0x0210: // LATIN CAPITAL LETTER R WITH DOUBLE GRAVE
+ case 0x0212: // LATIN CAPITAL LETTER R WITH INVERTED BREVE
+ case 0x1E58: // LATIN CAPITAL LETTER R WITH DOT ABOVE
+ case 0x1E5A: // LATIN CAPITAL LETTER R WITH DOT BELOW
+ case 0x1E5C: // LATIN CAPITAL LETTER R WITH DOT BELOW AND MACRON
+ case 0x1E5E: // LATIN CAPITAL LETTER R WITH LINE BELOW
+ case 0xFF32: // FULLWIDTH LATIN CAPITAL LETTER R
+ *out = 'R'; return 1;
+ case 0x015A: // LATIN CAPITAL LETTER S WITH ACUTE
+ case 0x015C: // LATIN CAPITAL LETTER S WITH CIRCUMFLEX
+ case 0x015E: // LATIN CAPITAL LETTER S WITH CEDILLA
+ case 0x0160: // LATIN CAPITAL LETTER S WITH CARON
+ case 0x1E60: // LATIN CAPITAL LETTER S WITH DOT ABOVE
+ case 0x1E62: // LATIN CAPITAL LETTER S WITH DOT BELOW
+ case 0x1E64: // LATIN CAPITAL LETTER S WITH ACUTE AND DOT ABOVE
+ case 0x1E66: // LATIN CAPITAL LETTER S WITH CARON AND DOT ABOVE
+ case 0x1E68: // LATIN CAPITAL LETTER S WITH DOT BELOW AND DOT ABOVE
+ case 0xFF33: // FULLWIDTH LATIN CAPITAL LETTER S
+ *out = 'S'; return 1;
+ case 0x0162: // LATIN CAPITAL LETTER T WITH CEDILLA
+ case 0x0164: // LATIN CAPITAL LETTER T WITH CARON
+ case 0x0166: // LATIN CAPITAL LETTER T WITH STROKE
+ case 0x01AC: // LATIN CAPITAL LETTER T WITH HOOK
+ case 0x01AE: // LATIN CAPITAL LETTER T WITH RETROFLEX HOOK
+ case 0x1E6A: // LATIN CAPITAL LETTER T WITH DOT ABOVE
+ case 0x1E6C: // LATIN CAPITAL LETTER T WITH DOT BELOW
+ case 0x1E6E: // LATIN CAPITAL LETTER T WITH LINE BELOW
+ case 0x1E70: // LATIN CAPITAL LETTER T WITH CIRCUMFLEX BELOW
+ case 0xFF34: // FULLWIDTH LATIN CAPITAL LETTER T
+ *out = 'T'; return 1;
+ case 0x00D9: // LATIN CAPITAL LETTER U WITH GRAVE
+ case 0x00DA: // LATIN CAPITAL LETTER U WITH ACUTE
+ case 0x00DB: // LATIN CAPITAL LETTER U WITH CIRCUMFLEX
+ case 0x00DC: // LATIN CAPITAL LETTER U WITH DIAERESIS
+ case 0x0168: // LATIN CAPITAL LETTER U WITH TILDE
+ case 0x016A: // LATIN CAPITAL LETTER U WITH MACRON
+ case 0x016C: // LATIN CAPITAL LETTER U WITH BREVE
+ case 0x016E: // LATIN CAPITAL LETTER U WITH RING ABOVE
+ case 0x0170: // LATIN CAPITAL LETTER U WITH DOUBLE ACUTE
+ case 0x0172: // LATIN CAPITAL LETTER U WITH OGONEK
+ case 0x01AF: // LATIN CAPITAL LETTER U WITH HORN
+ case 0x01D3: // LATIN CAPITAL LETTER U WITH CARON
+ case 0x01D5: // LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON
+ case 0x01D7: // LATIN CAPITAL LETTER U WITH DIAERESIS AND ACUTE
+ case 0x01D9: // LATIN CAPITAL LETTER U WITH DIAERESIS AND CARON
+ case 0x01DB: // LATIN CAPITAL LETTER U WITH DIAERESIS AND GRAVE
+ case 0x0214: // LATIN CAPITAL LETTER U WITH DOUBLE GRAVE
+ case 0x0216: // LATIN CAPITAL LETTER U WITH INVERTED BREVE
+ case 0x1E72: // LATIN CAPITAL LETTER U WITH DIAERESIS BELOW
+ case 0x1E74: // LATIN CAPITAL LETTER U WITH TILDE BELOW
+ case 0x1E76: // LATIN CAPITAL LETTER U WITH CIRCUMFLEX BELOW
+ case 0x1E78: // LATIN CAPITAL LETTER U WITH TILDE AND ACUTE
+ case 0x1E7A: // LATIN CAPITAL LETTER U WITH MACRON AND DIAERESIS
+ case 0x1EE4: // LATIN CAPITAL LETTER U WITH DOT BELOW
+ case 0x1EE6: // LATIN CAPITAL LETTER U WITH HOOK ABOVE
+ case 0x1EE8: // LATIN CAPITAL LETTER U WITH HORN AND ACUTE
+ case 0x1EEA: // LATIN CAPITAL LETTER U WITH HORN AND GRAVE
+ case 0x1EEC: // LATIN CAPITAL LETTER U WITH HORN AND HOOK ABOVE
+ case 0x1EEE: // LATIN CAPITAL LETTER U WITH HORN AND TILDE
+ case 0x1EF0: // LATIN CAPITAL LETTER U WITH HORN AND DOT BELOW
+ case 0xFF35: // FULLWIDTH LATIN CAPITAL LETTER U
+ *out = 'U'; return 1;
+ case 0x01B2: // LATIN CAPITAL LETTER V WITH HOOK
+ case 0x1E7C: // LATIN CAPITAL LETTER V WITH TILDE
+ case 0x1E7E: // LATIN CAPITAL LETTER V WITH DOT BELOW
+ case 0xFF36: // FULLWIDTH LATIN CAPITAL LETTER V
+ *out = 'V'; return 1;
+ case 0x0174: // LATIN CAPITAL LETTER W WITH CIRCUMFLEX
+ case 0x1E80: // LATIN CAPITAL LETTER W WITH GRAVE
+ case 0x1E82: // LATIN CAPITAL LETTER W WITH ACUTE
+ case 0x1E84: // LATIN CAPITAL LETTER W WITH DIAERESIS
+ case 0x1E86: // LATIN CAPITAL LETTER W WITH DOT ABOVE
+ case 0x1E88: // LATIN CAPITAL LETTER W WITH DOT BELOW
+ case 0xFF37: // FULLWIDTH LATIN CAPITAL LETTER W
+ *out = 'W'; return 1;
+ case 0x1E8A: // LATIN CAPITAL LETTER X WITH DOT ABOVE
+ case 0x1E8C: // LATIN CAPITAL LETTER X WITH DIAERESIS
+ case 0xFF38: // FULLWIDTH LATIN CAPITAL LETTER X
+ *out = 'X'; return 1;
+ case 0x00DD: // LATIN CAPITAL LETTER Y WITH ACUTE
+ case 0x0176: // LATIN CAPITAL LETTER Y WITH CIRCUMFLEX
+ case 0x0178: // LATIN CAPITAL LETTER Y WITH DIAERESIS
+ case 0x01B3: // LATIN CAPITAL LETTER Y WITH HOOK
+ case 0x1E8E: // LATIN CAPITAL LETTER Y WITH DOT ABOVE
+ case 0x1EF2: // LATIN CAPITAL LETTER Y WITH GRAVE
+ case 0x1EF4: // LATIN CAPITAL LETTER Y WITH DOT BELOW
+ case 0x1EF6: // LATIN CAPITAL LETTER Y WITH HOOK ABOVE
+ case 0x1EF8: // LATIN CAPITAL LETTER Y WITH TILDE
+ case 0xFF39: // FULLWIDTH LATIN CAPITAL LETTER Y
+ *out = 'Y'; return 1;
+ case 0x0179: // LATIN CAPITAL LETTER Z WITH ACUTE
+ case 0x017B: // LATIN CAPITAL LETTER Z WITH DOT ABOVE
+ case 0x017D: // LATIN CAPITAL LETTER Z WITH CARON
+ *out = 'Z'; return 1;
+ case 0x00E0: // LATIN SMALL LETTER A WITH GRAVE
+ case 0x00E1: // LATIN SMALL LETTER A WITH ACUTE
+ case 0x00E2: // LATIN SMALL LETTER A WITH CIRCUMFLEX
+ case 0x00E3: // LATIN SMALL LETTER A WITH TILDE
+ case 0x00E4: // LATIN SMALL LETTER A WITH DIAERESIS
+ case 0x00E5: // LATIN SMALL LETTER A WITH RING ABOVE
+ case 0x0101: // LATIN SMALL LETTER A WITH MACRON
+ case 0x0103: // LATIN SMALL LETTER A WITH BREVE
+ case 0x0105: // LATIN SMALL LETTER A WITH OGONEK
+ case 0x01CE: // LATIN SMALL LETTER A WITH CARON
+ case 0x01DF: // LATIN SMALL LETTER A WITH DIAERESIS AND MACRON
+ case 0x01E1: // LATIN SMALL LETTER A WITH DOT ABOVE AND MACRON
+ case 0x01FB: // LATIN SMALL LETTER A WITH RING ABOVE AND ACUTE
+ case 0x0201: // LATIN SMALL LETTER A WITH DOUBLE GRAVE
+ case 0x0203: // LATIN SMALL LETTER A WITH INVERTED BREVE
+ case 0x1E01: // LATIN SMALL LETTER A WITH RING BELOW
+ case 0x1E9A: // LATIN SMALL LETTER A WITH RIGHT HALF RING
+ case 0x1EA1: // LATIN SMALL LETTER A WITH DOT BELOW
+ case 0x1EA3: // LATIN SMALL LETTER A WITH HOOK ABOVE
+ case 0x1EA5: // LATIN SMALL LETTER A WITH CIRCUMFLEX AND ACUTE
+ case 0x1EA7: // LATIN SMALL LETTER A WITH CIRCUMFLEX AND GRAVE
+ case 0x1EA9: // LATIN SMALL LETTER A WITH CIRCUMFLEX AND HOOK ABOVE
+ case 0x1EAB: // LATIN SMALL LETTER A WITH CIRCUMFLEX AND TILDE
+ case 0x1EAD: // LATIN SMALL LETTER A WITH CIRCUMFLEX AND DOT BELOW
+ case 0x1EAF: // LATIN SMALL LETTER A WITH BREVE AND ACUTE
+ case 0x1EB1: // LATIN SMALL LETTER A WITH BREVE AND GRAVE
+ case 0x1EB3: // LATIN SMALL LETTER A WITH BREVE AND HOOK ABOVE
+ case 0x1EB5: // LATIN SMALL LETTER A WITH BREVE AND TILDE
+ case 0x1EB7: // LATIN SMALL LETTER A WITH BREVE AND DOT BELOW
+ case 0xFF41: // FULLWIDTH LATIN SMALL LETTER A
+ *out = 'a'; return 1;
+ case 0x0180: // LATIN SMALL LETTER B WITH STROKE
+ case 0x0183: // LATIN SMALL LETTER B WITH TOPBAR
+ case 0x0253: // LATIN SMALL LETTER B WITH HOOK
+ case 0x1E03: // LATIN SMALL LETTER B WITH DOT ABOVE
+ case 0x1E05: // LATIN SMALL LETTER B WITH DOT BELOW
+ case 0x1E07: // LATIN SMALL LETTER B WITH LINE BELOW
+ case 0xFF42: // FULLWIDTH LATIN SMALL LETTER B
+ *out = 'b'; return 1;
+ case 0x00E7: // LATIN SMALL LETTER C WITH CEDILLA
+ case 0x0107: // LATIN SMALL LETTER C WITH ACUTE
+ case 0x0109: // LATIN SMALL LETTER C WITH CIRCUMFLEX
+ case 0x010B: // LATIN SMALL LETTER C WITH DOT ABOVE
+ case 0x010D: // LATIN SMALL LETTER C WITH CARON
+ case 0x0188: // LATIN SMALL LETTER C WITH HOOK
+ case 0x0255: // LATIN SMALL LETTER C WITH CURL
+ case 0x1E09: // LATIN SMALL LETTER C WITH CEDILLA AND ACUTE
+ case 0xFF43: // FULLWIDTH LATIN SMALL LETTER C
+ *out = 'c'; return 1;
+ case 0x010F: // LATIN SMALL LETTER D WITH CARON
+ case 0x0111: // LATIN SMALL LETTER D WITH STROKE
+ case 0x018C: // LATIN SMALL LETTER D WITH TOPBAR
+ case 0x0256: // LATIN SMALL LETTER D WITH TAIL
+ case 0x0257: // LATIN SMALL LETTER D WITH HOOK
+ case 0x1E0B: // LATIN SMALL LETTER D WITH DOT ABOVE
+ case 0x1E0D: // LATIN SMALL LETTER D WITH DOT BELOW
+ case 0x1E0F: // LATIN SMALL LETTER D WITH LINE BELOW
+ case 0x1E11: // LATIN SMALL LETTER D WITH CEDILLA
+ case 0x1E13: // LATIN SMALL LETTER D WITH CIRCUMFLEX BELOW
+ case 0xFF44: // FULLWIDTH LATIN SMALL LETTER D
+ *out = 'd'; return 1;
+ case 0x00E8: // LATIN SMALL LETTER E WITH GRAVE
+ case 0x00E9: // LATIN SMALL LETTER E WITH ACUTE
+ case 0x00EA: // LATIN SMALL LETTER E WITH CIRCUMFLEX
+ case 0x00EB: // LATIN SMALL LETTER E WITH DIAERESIS
+ case 0x0113: // LATIN SMALL LETTER E WITH MACRON
+ case 0x0115: // LATIN SMALL LETTER E WITH BREVE
+ case 0x0117: // LATIN SMALL LETTER E WITH DOT ABOVE
+ case 0x0119: // LATIN SMALL LETTER E WITH OGONEK
+ case 0x011B: // LATIN SMALL LETTER E WITH CARON
+ case 0x0205: // LATIN SMALL LETTER E WITH DOUBLE GRAVE
+ case 0x0207: // LATIN SMALL LETTER E WITH INVERTED BREVE
+ case 0x1E15: // LATIN SMALL LETTER E WITH MACRON AND GRAVE
+ case 0x1E17: // LATIN SMALL LETTER E WITH MACRON AND ACUTE
+ case 0x1E19: // LATIN SMALL LETTER E WITH CIRCUMFLEX BELOW
+ case 0x1E1B: // LATIN SMALL LETTER E WITH TILDE BELOW
+ case 0x1E1D: // LATIN SMALL LETTER E WITH CEDILLA AND BREVE
+ case 0x1EB9: // LATIN SMALL LETTER E WITH DOT BELOW
+ case 0x1EBB: // LATIN SMALL LETTER E WITH HOOK ABOVE
+ case 0x1EBD: // LATIN SMALL LETTER E WITH TILDE
+ case 0x1EBF: // LATIN SMALL LETTER E WITH CIRCUMFLEX AND ACUTE
+ case 0x1EC1: // LATIN SMALL LETTER E WITH CIRCUMFLEX AND GRAVE
+ case 0x1EC3: // LATIN SMALL LETTER E WITH CIRCUMFLEX AND HOOK ABOVE
+ case 0x1EC5: // LATIN SMALL LETTER E WITH CIRCUMFLEX AND TILDE
+ case 0x1EC7: // LATIN SMALL LETTER E WITH CIRCUMFLEX AND DOT BELOW
+ case 0xFF45: // FULLWIDTH LATIN SMALL LETTER E
+ *out = 'e'; return 1;
+ case 0x0192: // LATIN SMALL LETTER F WITH HOOK
+ case 0x1E1F: // LATIN SMALL LETTER F WITH DOT ABOVE
+ case 0xFF46: // FULLWIDTH LATIN SMALL LETTER F
+ *out = 'f'; return 1;
+ case 0x011D: // LATIN SMALL LETTER G WITH CIRCUMFLEX
+ case 0x011F: // LATIN SMALL LETTER G WITH BREVE
+ case 0x0121: // LATIN SMALL LETTER G WITH DOT ABOVE
+ case 0x0123: // LATIN SMALL LETTER G WITH CEDILLA
+ case 0x01E5: // LATIN SMALL LETTER G WITH STROKE
+ case 0x01E7: // LATIN SMALL LETTER G WITH CARON
+ case 0x01F5: // LATIN SMALL LETTER G WITH ACUTE
+ case 0x0260: // LATIN SMALL LETTER G WITH HOOK
+ case 0x1E21: // LATIN SMALL LETTER G WITH MACRON
+ case 0xFF47: // FULLWIDTH LATIN SMALL LETTER G
+ *out = 'g'; return 1;
+ case 0x0125: // LATIN SMALL LETTER H WITH CIRCUMFLEX
+ case 0x0127: // LATIN SMALL LETTER H WITH STROKE
+ case 0x0266: // LATIN SMALL LETTER H WITH HOOK
+ case 0x1E23: // LATIN SMALL LETTER H WITH DOT ABOVE
+ case 0x1E25: // LATIN SMALL LETTER H WITH DOT BELOW
+ case 0x1E27: // LATIN SMALL LETTER H WITH DIAERESIS
+ case 0x1E29: // LATIN SMALL LETTER H WITH CEDILLA
+ case 0x1E2B: // LATIN SMALL LETTER H WITH BREVE BELOW
+ case 0x1E96: // LATIN SMALL LETTER H WITH LINE BELOW
+ case 0xFF48: // FULLWIDTH LATIN SMALL LETTER H
+ *out = 'h'; return 1;
+ case 0x00EC: // LATIN SMALL LETTER I WITH GRAVE
+ case 0x00ED: // LATIN SMALL LETTER I WITH ACUTE
+ case 0x00EE: // LATIN SMALL LETTER I WITH CIRCUMFLEX
+ case 0x00EF: // LATIN SMALL LETTER I WITH DIAERESIS
+ case 0x0129: // LATIN SMALL LETTER I WITH TILDE
+ case 0x012B: // LATIN SMALL LETTER I WITH MACRON
+ case 0x012D: // LATIN SMALL LETTER I WITH BREVE
+ case 0x012F: // LATIN SMALL LETTER I WITH OGONEK
+ case 0x01D0: // LATIN SMALL LETTER I WITH CARON
+ case 0x0209: // LATIN SMALL LETTER I WITH DOUBLE GRAVE
+ case 0x020B: // LATIN SMALL LETTER I WITH INVERTED BREVE
+ case 0x0268: // LATIN SMALL LETTER I WITH STROKE
+ case 0x1E2D: // LATIN SMALL LETTER I WITH TILDE BELOW
+ case 0x1E2F: // LATIN SMALL LETTER I WITH DIAERESIS AND ACUTE
+ case 0x1EC9: // LATIN SMALL LETTER I WITH HOOK ABOVE
+ case 0x1ECB: // LATIN SMALL LETTER I WITH DOT BELOW
+ case 0xFF49: // FULLWIDTH LATIN SMALL LETTER I
+ *out = 'i'; return 1;
+ case 0x0135: // LATIN SMALL LETTER J WITH CIRCUMFLEX
+ case 0x01F0: // LATIN SMALL LETTER J WITH CARON
+ case 0x029D: // LATIN SMALL LETTER J WITH CROSSED-TAIL
+ case 0xFF4A: // FULLWIDTH LATIN SMALL LETTER J
+ *out = 'j'; return 1;
+ case 0x0137: // LATIN SMALL LETTER K WITH CEDILLA
+ case 0x0199: // LATIN SMALL LETTER K WITH HOOK
+ case 0x01E9: // LATIN SMALL LETTER K WITH CARON
+ case 0x1E31: // LATIN SMALL LETTER K WITH ACUTE
+ case 0x1E33: // LATIN SMALL LETTER K WITH DOT BELOW
+ case 0x1E35: // LATIN SMALL LETTER K WITH LINE BELOW
+ case 0xFF4B: // FULLWIDTH LATIN SMALL LETTER K
+ *out = 'k'; return 1;
+ case 0x013A: // LATIN SMALL LETTER L WITH ACUTE
+ case 0x013C: // LATIN SMALL LETTER L WITH CEDILLA
+ case 0x013E: // LATIN SMALL LETTER L WITH CARON
+ case 0x0140: // LATIN SMALL LETTER L WITH MIDDLE DOT
+ case 0x0142: // LATIN SMALL LETTER L WITH STROKE
+ case 0x019A: // LATIN SMALL LETTER L WITH BAR
+ case 0x026B: // LATIN SMALL LETTER L WITH MIDDLE TILDE
+ case 0x026C: // LATIN SMALL LETTER L WITH BELT
+ case 0x026D: // LATIN SMALL LETTER L WITH RETROFLEX HOOK
+ case 0x1E37: // LATIN SMALL LETTER L WITH DOT BELOW
+ case 0x1E39: // LATIN SMALL LETTER L WITH DOT BELOW AND MACRON
+ case 0x1E3B: // LATIN SMALL LETTER L WITH LINE BELOW
+ case 0x1E3D: // LATIN SMALL LETTER L WITH CIRCUMFLEX BELOW
+ case 0xFF4C: // FULLWIDTH LATIN SMALL LETTER L
+ *out = 'l'; return 1;
+ case 0x0271: // LATIN SMALL LETTER M WITH HOOK
+ case 0x1E3F: // LATIN SMALL LETTER M WITH ACUTE
+ case 0x1E41: // LATIN SMALL LETTER M WITH DOT ABOVE
+ case 0x1E43: // LATIN SMALL LETTER M WITH DOT BELOW
+ case 0xFF4D: // FULLWIDTH LATIN SMALL LETTER M
+ *out = 'm'; return 1;
+ case 0x00F1: // LATIN SMALL LETTER N WITH TILDE
+ case 0x0144: // LATIN SMALL LETTER N WITH ACUTE
+ case 0x0146: // LATIN SMALL LETTER N WITH CEDILLA
+ case 0x0148: // LATIN SMALL LETTER N WITH CARON
+ case 0x019E: // LATIN SMALL LETTER N WITH LONG RIGHT LEG
+ case 0x0272: // LATIN SMALL LETTER N WITH LEFT HOOK
+ case 0x0273: // LATIN SMALL LETTER N WITH RETROFLEX HOOK
+ case 0x1E45: // LATIN SMALL LETTER N WITH DOT ABOVE
+ case 0x1E47: // LATIN SMALL LETTER N WITH DOT BELOW
+ case 0x1E49: // LATIN SMALL LETTER N WITH LINE BELOW
+ case 0x1E4B: // LATIN SMALL LETTER N WITH CIRCUMFLEX BELOW
+ case 0xFF4E: // FULLWIDTH LATIN SMALL LETTER N
+ *out = 'n'; return 1;
+ case 0x00F2: // LATIN SMALL LETTER O WITH GRAVE
+ case 0x00F3: // LATIN SMALL LETTER O WITH ACUTE
+ case 0x00F4: // LATIN SMALL LETTER O WITH CIRCUMFLEX
+ case 0x00F5: // LATIN SMALL LETTER O WITH TILDE
+ case 0x00F6: // LATIN SMALL LETTER O WITH DIAERESIS
+ case 0x00F8: // LATIN SMALL LETTER O WITH STROKE
+ case 0x014D: // LATIN SMALL LETTER O WITH MACRON
+ case 0x014F: // LATIN SMALL LETTER O WITH BREVE
+ case 0x0151: // LATIN SMALL LETTER O WITH DOUBLE ACUTE
+ case 0x01A1: // LATIN SMALL LETTER O WITH HORN
+ case 0x01D2: // LATIN SMALL LETTER O WITH CARON
+ case 0x01EB: // LATIN SMALL LETTER O WITH OGONEK
+ case 0x01ED: // LATIN SMALL LETTER O WITH OGONEK AND MACRON
+ case 0x01FF: // LATIN SMALL LETTER O WITH STROKE AND ACUTE
+ case 0x020D: // LATIN SMALL LETTER O WITH DOUBLE GRAVE
+ case 0x020F: // LATIN SMALL LETTER O WITH INVERTED BREVE
+ case 0x1E4D: // LATIN SMALL LETTER O WITH TILDE AND ACUTE
+ case 0x1E4F: // LATIN SMALL LETTER O WITH TILDE AND DIAERESIS
+ case 0x1E51: // LATIN SMALL LETTER O WITH MACRON AND GRAVE
+ case 0x1E53: // LATIN SMALL LETTER O WITH MACRON AND ACUTE
+ case 0x1ECD: // LATIN SMALL LETTER O WITH DOT BELOW
+ case 0x1ECF: // LATIN SMALL LETTER O WITH HOOK ABOVE
+ case 0x1ED1: // LATIN SMALL LETTER O WITH CIRCUMFLEX AND ACUTE
+ case 0x1ED3: // LATIN SMALL LETTER O WITH CIRCUMFLEX AND GRAVE
+ case 0x1ED5: // LATIN SMALL LETTER O WITH CIRCUMFLEX AND HOOK ABOVE
+ case 0x1ED7: // LATIN SMALL LETTER O WITH CIRCUMFLEX AND TILDE
+ case 0x1ED9: // LATIN SMALL LETTER O WITH CIRCUMFLEX AND DOT BELOW
+ case 0x1EDB: // LATIN SMALL LETTER O WITH HORN AND ACUTE
+ case 0x1EDD: // LATIN SMALL LETTER O WITH HORN AND GRAVE
+ case 0x1EDF: // LATIN SMALL LETTER O WITH HORN AND HOOK ABOVE
+ case 0x1EE1: // LATIN SMALL LETTER O WITH HORN AND TILDE
+ case 0x1EE3: // LATIN SMALL LETTER O WITH HORN AND DOT BELOW
+ case 0xFF4F: // FULLWIDTH LATIN SMALL LETTER O
+ *out = 'o'; return 1;
+ case 0x01A5: // LATIN SMALL LETTER P WITH HOOK
+ case 0x1E55: // LATIN SMALL LETTER P WITH ACUTE
+ case 0x1E57: // LATIN SMALL LETTER P WITH DOT ABOVE
+ case 0xFF50: // FULLWIDTH LATIN SMALL LETTER P
+ *out = 'p'; return 1;
+ case 0x02A0: // LATIN SMALL LETTER Q WITH HOOK
+ case 0xFF51: // FULLWIDTH LATIN SMALL LETTER Q
+ *out = 'q'; return 1;
+ case 0x0155: // LATIN SMALL LETTER R WITH ACUTE
+ case 0x0157: // LATIN SMALL LETTER R WITH CEDILLA
+ case 0x0159: // LATIN SMALL LETTER R WITH CARON
+ case 0x0211: // LATIN SMALL LETTER R WITH DOUBLE GRAVE
+ case 0x0213: // LATIN SMALL LETTER R WITH INVERTED BREVE
+ case 0x027C: // LATIN SMALL LETTER R WITH LONG LEG
+ case 0x027D: // LATIN SMALL LETTER R WITH TAIL
+ case 0x027E: // LATIN SMALL LETTER R WITH FISHHOOK
+ case 0x1E59: // LATIN SMALL LETTER R WITH DOT ABOVE
+ case 0x1E5B: // LATIN SMALL LETTER R WITH DOT BELOW
+ case 0x1E5D: // LATIN SMALL LETTER R WITH DOT BELOW AND MACRON
+ case 0x1E5F: // LATIN SMALL LETTER R WITH LINE BELOW
+ case 0xFF52: // FULLWIDTH LATIN SMALL LETTER R
+ *out = 'r'; return 1;
+ case 0x015B: // LATIN SMALL LETTER S WITH ACUTE
+ case 0x015D: // LATIN SMALL LETTER S WITH CIRCUMFLEX
+ case 0x015F: // LATIN SMALL LETTER S WITH CEDILLA
+ case 0x0161: // LATIN SMALL LETTER S WITH CARON
+ case 0x0282: // LATIN SMALL LETTER S WITH HOOK
+ case 0x1E61: // LATIN SMALL LETTER S WITH DOT ABOVE
+ case 0x1E63: // LATIN SMALL LETTER S WITH DOT BELOW
+ case 0x1E65: // LATIN SMALL LETTER S WITH ACUTE AND DOT ABOVE
+ case 0x1E67: // LATIN SMALL LETTER S WITH CARON AND DOT ABOVE
+ case 0x1E69: // LATIN SMALL LETTER S WITH DOT BELOW AND DOT ABOVE
+ case 0xFF53: // FULLWIDTH LATIN SMALL LETTER S
+ *out = 's'; return 1;
+ case 0x0163: // LATIN SMALL LETTER T WITH CEDILLA
+ case 0x0165: // LATIN SMALL LETTER T WITH CARON
+ case 0x0167: // LATIN SMALL LETTER T WITH STROKE
+ case 0x01AB: // LATIN SMALL LETTER T WITH PALATAL HOOK
+ case 0x01AD: // LATIN SMALL LETTER T WITH HOOK
+ case 0x0288: // LATIN SMALL LETTER T WITH RETROFLEX HOOK
+ case 0x1E6B: // LATIN SMALL LETTER T WITH DOT ABOVE
+ case 0x1E6D: // LATIN SMALL LETTER T WITH DOT BELOW
+ case 0x1E6F: // LATIN SMALL LETTER T WITH LINE BELOW
+ case 0x1E71: // LATIN SMALL LETTER T WITH CIRCUMFLEX BELOW
+ case 0x1E97: // LATIN SMALL LETTER T WITH DIAERESIS
+ case 0xFF54: // FULLWIDTH LATIN SMALL LETTER T
+ *out = 't'; return 1;
+ case 0x00F9: // LATIN SMALL LETTER U WITH GRAVE
+ case 0x00FA: // LATIN SMALL LETTER U WITH ACUTE
+ case 0x00FB: // LATIN SMALL LETTER U WITH CIRCUMFLEX
+ case 0x00FC: // LATIN SMALL LETTER U WITH DIAERESIS
+ case 0x0169: // LATIN SMALL LETTER U WITH TILDE
+ case 0x016B: // LATIN SMALL LETTER U WITH MACRON
+ case 0x016D: // LATIN SMALL LETTER U WITH BREVE
+ case 0x016F: // LATIN SMALL LETTER U WITH RING ABOVE
+ case 0x0171: // LATIN SMALL LETTER U WITH DOUBLE ACUTE
+ case 0x0173: // LATIN SMALL LETTER U WITH OGONEK
+ case 0x01B0: // LATIN SMALL LETTER U WITH HORN
+ case 0x01D4: // LATIN SMALL LETTER U WITH CARON
+ case 0x01D6: // LATIN SMALL LETTER U WITH DIAERESIS AND MACRON
+ case 0x01D8: // LATIN SMALL LETTER U WITH DIAERESIS AND ACUTE
+ case 0x01DA: // LATIN SMALL LETTER U WITH DIAERESIS AND CARON
+ case 0x01DC: // LATIN SMALL LETTER U WITH DIAERESIS AND GRAVE
+ case 0x0215: // LATIN SMALL LETTER U WITH DOUBLE GRAVE
+ case 0x0217: // LATIN SMALL LETTER U WITH INVERTED BREVE
+ case 0x0289: // LATIN SMALL LETTER U BAR
+ case 0x1E73: // LATIN SMALL LETTER U WITH DIAERESIS BELOW
+ case 0x1E75: // LATIN SMALL LETTER U WITH TILDE BELOW
+ case 0x1E77: // LATIN SMALL LETTER U WITH CIRCUMFLEX BELOW
+ case 0x1E79: // LATIN SMALL LETTER U WITH TILDE AND ACUTE
+ case 0x1E7B: // LATIN SMALL LETTER U WITH MACRON AND DIAERESIS
+ case 0x1EE5: // LATIN SMALL LETTER U WITH DOT BELOW
+ case 0x1EE7: // LATIN SMALL LETTER U WITH HOOK ABOVE
+ case 0x1EE9: // LATIN SMALL LETTER U WITH HORN AND ACUTE
+ case 0x1EEB: // LATIN SMALL LETTER U WITH HORN AND GRAVE
+ case 0x1EED: // LATIN SMALL LETTER U WITH HORN AND HOOK ABOVE
+ case 0x1EEF: // LATIN SMALL LETTER U WITH HORN AND TILDE
+ case 0x1EF1: // LATIN SMALL LETTER U WITH HORN AND DOT BELOW
+ case 0xFF55: // FULLWIDTH LATIN SMALL LETTER U
+ *out = 'u'; return 1;
+ case 0x028B: // LATIN SMALL LETTER V WITH HOOK
+ case 0x1E7D: // LATIN SMALL LETTER V WITH TILDE
+ case 0x1E7F: // LATIN SMALL LETTER V WITH DOT BELOW
+ case 0xFF56: // FULLWIDTH LATIN SMALL LETTER V
+ *out = 'v'; return 1;
+ case 0x0175: // LATIN SMALL LETTER W WITH CIRCUMFLEX
+ case 0x1E81: // LATIN SMALL LETTER W WITH GRAVE
+ case 0x1E83: // LATIN SMALL LETTER W WITH ACUTE
+ case 0x1E85: // LATIN SMALL LETTER W WITH DIAERESIS
+ case 0x1E87: // LATIN SMALL LETTER W WITH DOT ABOVE
+ case 0x1E89: // LATIN SMALL LETTER W WITH DOT BELOW
+ case 0x1E98: // LATIN SMALL LETTER W WITH RING ABOVE
+ case 0xFF57: // FULLWIDTH LATIN SMALL LETTER W
+ *out = 'w'; return 1;
+ case 0x1E8B: // LATIN SMALL LETTER X WITH DOT ABOVE
+ case 0x1E8D: // LATIN SMALL LETTER X WITH DIAERESIS
+ case 0xFF58: // FULLWIDTH LATIN SMALL LETTER X
+ *out = 'x'; return 1;
+ case 0x00FD: // LATIN SMALL LETTER Y WITH ACUTE
+ case 0x00FF: // LATIN SMALL LETTER Y WITH DIAERESIS
+ case 0x0177: // LATIN SMALL LETTER Y WITH CIRCUMFLEX
+ case 0x01B4: // LATIN SMALL LETTER Y WITH HOOK
+ case 0x1E8F: // LATIN SMALL LETTER Y WITH DOT ABOVE
+ case 0x1E99: // LATIN SMALL LETTER Y WITH RING ABOVE
+ case 0x1EF3: // LATIN SMALL LETTER Y WITH GRAVE
+ case 0x1EF5: // LATIN SMALL LETTER Y WITH DOT BELOW
+ case 0x1EF7: // LATIN SMALL LETTER Y WITH HOOK ABOVE
+ case 0x1EF9: // LATIN SMALL LETTER Y WITH TILDE
+ case 0xFF59: // FULLWIDTH LATIN SMALL LETTER Y
+ *out = 'y'; return 1;
+ case 0x017A: // LATIN SMALL LETTER Z WITH ACUTE
+ case 0x017C: // LATIN SMALL LETTER Z WITH DOT ABOVE
+ case 0x017E: // LATIN SMALL LETTER Z WITH CARON
+ case 0x01B6: // LATIN SMALL LETTER Z WITH STROKE
+ case 0x0290: // LATIN SMALL LETTER Z WITH RETROFLEX HOOK
+ case 0x0291: // LATIN SMALL LETTER Z WITH CURL
+ case 0x1E91: // LATIN SMALL LETTER Z WITH CIRCUMFLEX
+ case 0x1E93: // LATIN SMALL LETTER Z WITH DOT BELOW
+ case 0x1E95: // LATIN SMALL LETTER Z WITH LINE BELOW
+ case 0xFF5A: // FULLWIDTH LATIN SMALL LETTER Z
+ *out = 'z'; return 1;
+ case 0x0660: // ARABIC-INDIC DIGIT ZERO
+ case 0x06F0: // EXTENDED ARABIC-INDIC DIGIT ZERO
+ case 0x0966: // DEVANAGARI DIGIT ZERO
+ case 0x09E6: // BENGALI DIGIT ZERO
+ case 0x0A66: // GURMUKHI DIGIT ZERO
+ case 0x0AE6: // GUJARATI DIGIT ZERO
+ case 0x0B66: // ORIYA DIGIT ZERO
+ case 0x0BE6: // TAMIL DIGIT ZERO
+ case 0x0C66: // TELUGU DIGIT ZERO
+ case 0x0CE6: // KANNADA DIGIT ZERO
+ case 0x0D66: // MALAYALAM DIGIT ZERO
+ case 0x0E50: // THAI DIGIT ZERO
+ case 0x0ED0: // LAO DIGIT ZERO
+ case 0x0F20: // TIBETAN DIGIT ZERO
+ case 0xFF10: // FULLWIDTH DIGIT ZERO
+ *out = '0'; return 1;
+ case 0x0661: // ARABIC-INDIC DIGIT ONE
+ case 0x06F1: // EXTENDED ARABIC-INDIC DIGIT ONE
+ case 0x0967: // DEVANAGARI DIGIT ONE
+ case 0x09E7: // BENGALI DIGIT ONE
+ case 0x0A67: // GURMUKHI DIGIT ONE
+ case 0x0AE7: // GUJARATI DIGIT ONE
+ case 0x0B67: // ORIYA DIGIT ONE
+ case 0x0BE7: // TAMIL DIGIT ONE
+ case 0x0C67: // TELUGU DIGIT ONE
+ case 0x0CE7: // KANNADA DIGIT ONE
+ case 0x0D67: // MALAYALAM DIGIT ONE
+ case 0x0E51: // THAI DIGIT ONE
+ case 0x0ED1: // LAO DIGIT ONE
+ case 0x0F21: // TIBETAN DIGIT ONE
+ case 0xFF11: // FULLWIDTH DIGIT ONE
+ *out = '1'; return 1;
+ case 0x0662: // ARABIC-INDIC DIGIT TWO
+ case 0x06F2: // EXTENDED ARABIC-INDIC DIGIT TWO
+ case 0x0968: // DEVANAGARI DIGIT TWO
+ case 0x09E8: // BENGALI DIGIT TWO
+ case 0x0A68: // GURMUKHI DIGIT TWO
+ case 0x0AE8: // GUJARATI DIGIT TWO
+ case 0x0B68: // ORIYA DIGIT TWO
+ case 0x0BE8: // TAMIL DIGIT TWO
+ case 0x0C68: // TELUGU DIGIT TWO
+ case 0x0CE8: // KANNADA DIGIT TWO
+ case 0x0D68: // MALAYALAM DIGIT TWO
+ case 0x0E52: // THAI DIGIT TWO
+ case 0x0ED2: // LAO DIGIT TWO
+ case 0x0F22: // TIBETAN DIGIT TWO
+ case 0xFF12: // FULLWIDTH DIGIT TWO
+ *out = '2'; return 1;
+ case 0x0663: // ARABIC-INDIC DIGIT THREE
+ case 0x06F3: // EXTENDED ARABIC-INDIC DIGIT THREE
+ case 0x0969: // DEVANAGARI DIGIT THREE
+ case 0x09E9: // BENGALI DIGIT THREE
+ case 0x0A69: // GURMUKHI DIGIT THREE
+ case 0x0AE9: // GUJARATI DIGIT THREE
+ case 0x0B69: // ORIYA DIGIT THREE
+ case 0x0BE9: // TAMIL DIGIT THREE
+ case 0x0C69: // TELUGU DIGIT THREE
+ case 0x0CE9: // KANNADA DIGIT THREE
+ case 0x0D69: // MALAYALAM DIGIT THREE
+ case 0x0E53: // THAI DIGIT THREE
+ case 0x0ED3: // LAO DIGIT THREE
+ case 0x0F23: // TIBETAN DIGIT THREE
+ case 0xFF13: // FULLWIDTH DIGIT THREE
+ *out = '3'; return 1;
+ case 0x0664: // ARABIC-INDIC DIGIT FOUR
+ case 0x06F4: // EXTENDED ARABIC-INDIC DIGIT FOUR
+ case 0x096A: // DEVANAGARI DIGIT FOUR
+ case 0x09EA: // BENGALI DIGIT FOUR
+ case 0x0A6A: // GURMUKHI DIGIT FOUR
+ case 0x0AEA: // GUJARATI DIGIT FOUR
+ case 0x0B6A: // ORIYA DIGIT FOUR
+ case 0x0BEA: // TAMIL DIGIT FOUR
+ case 0x0C6A: // TELUGU DIGIT FOUR
+ case 0x0CEA: // KANNADA DIGIT FOUR
+ case 0x0D6A: // MALAYALAM DIGIT FOUR
+ case 0x0E54: // THAI DIGIT FOUR
+ case 0x0ED4: // LAO DIGIT FOUR
+ case 0x0F24: // TIBETAN DIGIT FOUR
+ case 0xFF14: // FULLWIDTH DIGIT FOUR
+ *out = '4'; return 1;
+ case 0x0665: // ARABIC-INDIC DIGIT FIVE
+ case 0x06F5: // EXTENDED ARABIC-INDIC DIGIT FIVE
+ case 0x096B: // DEVANAGARI DIGIT FIVE
+ case 0x09EB: // BENGALI DIGIT FIVE
+ case 0x0A6B: // GURMUKHI DIGIT FIVE
+ case 0x0AEB: // GUJARATI DIGIT FIVE
+ case 0x0B6B: // ORIYA DIGIT FIVE
+ case 0x0BEB: // TAMIL DIGIT FIVE
+ case 0x0C6B: // TELUGU DIGIT FIVE
+ case 0x0CEB: // KANNADA DIGIT FIVE
+ case 0x0D6B: // MALAYALAM DIGIT FIVE
+ case 0x0E55: // THAI DIGIT FIVE
+ case 0x0ED5: // LAO DIGIT FIVE
+ case 0x0F25: // TIBETAN DIGIT FIVE
+ case 0xFF15: // FULLWIDTH DIGIT FIVE
+ *out = '5'; return 1;
+ case 0x0666: // ARABIC-INDIC DIGIT SIX
+ case 0x06F6: // EXTENDED ARABIC-INDIC DIGIT SIX
+ case 0x096C: // DEVANAGARI DIGIT SIX
+ case 0x09EC: // BENGALI DIGIT SIX
+ case 0x0A6C: // GURMUKHI DIGIT SIX
+ case 0x0AEC: // GUJARATI DIGIT SIX
+ case 0x0B6C: // ORIYA DIGIT SIX
+ case 0x0BEC: // TAMIL DIGIT SIX
+ case 0x0C6C: // TELUGU DIGIT SIX
+ case 0x0CEC: // KANNADA DIGIT SIX
+ case 0x0D6C: // MALAYALAM DIGIT SIX
+ case 0x0E56: // THAI DIGIT SIX
+ case 0x0ED6: // LAO DIGIT SIX
+ case 0x0F26: // TIBETAN DIGIT SIX
+ case 0xFF16: // FULLWIDTH DIGIT SIX
+ *out = '6'; return 1;
+ case 0x0667: // ARABIC-INDIC DIGIT SEVEN
+ case 0x06F7: // EXTENDED ARABIC-INDIC DIGIT SEVEN
+ case 0x096D: // DEVANAGARI DIGIT SEVEN
+ case 0x09ED: // BENGALI DIGIT SEVEN
+ case 0x0A6D: // GURMUKHI DIGIT SEVEN
+ case 0x0AED: // GUJARATI DIGIT SEVEN
+ case 0x0B6D: // ORIYA DIGIT SEVEN
+ case 0x0BED: // TAMIL DIGIT SEVEN
+ case 0x0C6D: // TELUGU DIGIT SEVEN
+ case 0x0CED: // KANNADA DIGIT SEVEN
+ case 0x0D6D: // MALAYALAM DIGIT SEVEN
+ case 0x0E57: // THAI DIGIT SEVEN
+ case 0x0ED7: // LAO DIGIT SEVEN
+ case 0x0F27: // TIBETAN DIGIT SEVEN
+ case 0xFF17: // FULLWIDTH DIGIT SEVEN
+ *out = '7'; return 1;
+ case 0x0668: // ARABIC-INDIC DIGIT EIGHT
+ case 0x06F8: // EXTENDED ARABIC-INDIC DIGIT EIGHT
+ case 0x096E: // DEVANAGARI DIGIT EIGHT
+ case 0x09EE: // BENGALI DIGIT EIGHT
+ case 0x0A6E: // GURMUKHI DIGIT EIGHT
+ case 0x0AEE: // GUJARATI DIGIT EIGHT
+ case 0x0B6E: // ORIYA DIGIT EIGHT
+ case 0x0BEE: // TAMIL DIGIT EIGHT
+ case 0x0C6E: // TELUGU DIGIT EIGHT
+ case 0x0CEE: // KANNADA DIGIT EIGHT
+ case 0x0D6E: // MALAYALAM DIGIT EIGHT
+ case 0x0E58: // THAI DIGIT EIGHT
+ case 0x0ED8: // LAO DIGIT EIGHT
+ case 0x0F28: // TIBETAN DIGIT EIGHT
+ case 0xFF18: // FULLWIDTH DIGIT EIGHT
+ *out = '8'; return 1;
+ case 0x0669: // ARABIC-INDIC DIGIT NINE
+ case 0x06F9: // EXTENDED ARABIC-INDIC DIGIT NINE
+ case 0x096F: // DEVANAGARI DIGIT NINE
+ case 0x09EF: // BENGALI DIGIT NINE
+ case 0x0A6F: // GURMUKHI DIGIT NINE
+ case 0x0AEF: // GUJARATI DIGIT NINE
+ case 0x0B6F: // ORIYA DIGIT NINE
+ case 0x0BEF: // TAMIL DIGIT NINE
+ case 0x0C6F: // TELUGU DIGIT NINE
+ case 0x0CEF: // KANNADA DIGIT NINE
+ case 0x0D6F: // MALAYALAM DIGIT NINE
+ case 0x0E59: // THAI DIGIT NINE
+ case 0x0ED9: // LAO DIGIT NINE
+ case 0x0F29: // TIBETAN DIGIT NINE
+ case 0xFF19: // FULLWIDTH DIGIT NINE
+ *out = '9'; return 1;
+ case 0x00A1: // INVERTED EXCLAMATION MARK
+ *out = '!'; return 1;
+ case 0x00A6: // BROKEN BAR
+ *out = '|'; return 1;
+ case 0x00AD: // SOFT HYPHEN
+ case 0x02D7: // MODIFIER LETTER MINUS SIGN
+ case 0x2010: // HYPHEN
+ case 0x2011: // NON-BREAKING HYPHEN
+ case 0x2212: // MINUS SIGN
+ case 0xFE63: // SMALL HYPHEN-MINUS
+ case 0xFF0D: // FULLWIDTH HYPHEN-MINUS
+ *out = '-'; return 1;
+ case 0x00BF: // INVERTED QUESTION MARK
+ *out = '?'; return 1;
+ case 0x00D7: // MULTIPLICATION SIGN
+ *out = 'x'; return 1;
+ case 0x2018: // LEFT SINGLE QUOTATION MARK
+ case 0x2019: // RIGHT SINGLE QUOTATION MARK
+ case 0xFF07: // FULLWIDTH APOSTROPHE
+ *out = '\''; return 1;
+ case 0x201c: // LEFT DOUBLE QUOTATION MARK
+ case 0x201d: // RIGHT DOUBLE QUOTATION MARK
+ case 0xFF02: // FULLWIDTH QUOTATION MARK
*out = '"'; return 1;
default:
return 0;
@@ -486,6 +1354,10 @@
};
+/*
+ TODO I'm pretty sure you can't break Korean at any character.
+ And what about Japanese Katakana and Hiragana?
+*/
static const _rmap can_break_words_data[]=
{
{"0"}, /* default value - can't break words at any character. */
@@ -787,7 +1669,8 @@
}
{
if (cjk_locale()) {
- /* CJK guys should do something similar to 'else' branch */
+ /* CJK guys should do something similar to 'else' branch */
+
+ TexPrologue = " ";
} else {
char buf[500];
int len = 0;
@@ -876,6 +1759,11 @@
return TexPrologue;
};
+// Warning:
+// This code forces us to use "GB2312", "BIG5", etc instead
+// of "CP936", "CP950", etc even when our iconv supports
+// the "CPxxx" form and the encodings differ.
+// Be sure this is what you want if you call this function.
const char* XAP_EncodingManager::charsetFromCodepage(int lid) const
{
static char buf[100];
Index: src/af/xap/xp/xap_EncodingManager.h
===================================================================
RCS file: /cvsroot/abi/src/af/xap/xp/xap_EncodingManager.h,v
retrieving revision 1.19
diff -u -r1.19 xap_EncodingManager.h
--- src/af/xap/xp/xap_EncodingManager.h 2001/05/25 05:52:12 1.19
+++ src/af/xap/xp/xap_EncodingManager.h 2001/06/03 07:18:23
@@ -52,9 +52,14 @@
/*
this shouldn't return NULL. Don't free or write to returned string.
The string should be uppercased (extra font tarballs assume this).
+ TODO isn't iconv case sensitive? Mac encoding names are mixed case!
*/
virtual const char* getNativeEncodingName() const;
+ /*
+ This should return true for any Unicode locale:
+ UTF-8 on *nix, UCS-2 on Windows, etc
+ */
inline virtual bool isUnicodeLocale() const {return m_bIsUnicodeLocale;}
/*