At 10:16 pm -0500 17/1/01, Ronald J Kimball wrote:
| On Thu, Jan 18, 2001 at 12:38:57PM +1030, Paul McCann wrote:
| > $_ = "Señor";
| > %table=(150=>"[0x00F1]",
| > 151=>"[0x00F2]"); #whatever the mappings are
| > s~([\x80-\xFF])~$table{ord($1)}~ge;
| > print;
| >
| > will get you started. The key is the "e" modifier..
|
| Although, since scalar hash values interpolate in double-quoted strings
| anyway, the /e is actually unnecessary with that replacement.
|
| s~([\x80-\xFF])~$table{ord($1)}~g;
|
| will work just as well.
Thank you both Paul and Ronald equally! It seems I was moving towards
it and it just needed your messages to clear the log jam.
Here's the end result: a filter that marks every 8-bit Mac
character in a text by adding after it the {Latin-1} equivalent,
where one exists, and the <Unicode HTML string> for the character.
I post it in full since it might be useful to people after
modification for particular requirements. I shall be making another
table to fill any {blanks} that have Windows-1252 equivalents.
# %macToLatin1plus -- annotation table for 8-bit Mac characters.
#
# Key:   the numeric code (128..255) of one high-bit character, as
#        returned by ord().
# Value: the text to add after that character, of the form
#            {L}<U>
#        where {L} is the character's Latin-1 code point written as a
#        \xHH escape (this part is left out when Latin-1 has no such
#        character) and <U> is the character itself between angle
#        brackets.
# The trailing comment on each line is the Unicode character name.
%macToLatin1plus=(
128=>"{\xC4}<Ä>", # LATIN CAPITAL LETTER A WITH DIAERESIS
129=>"{\xC5}<Å>", # LATIN CAPITAL LETTER A WITH RING ABOVE
130=>"{\xC7}<Ç>", # LATIN CAPITAL LETTER C WITH CEDILLA
131=>"{\xC9}<É>", # LATIN CAPITAL LETTER E WITH ACUTE
132=>"{\xD1}<Ñ>", # LATIN CAPITAL LETTER N WITH TILDE
133=>"{\xD6}<Ö>", # LATIN CAPITAL LETTER O WITH DIAERESIS
134=>"{\xDC}<Ü>", # LATIN CAPITAL LETTER U WITH DIAERESIS
135=>"{\xE1}<á>", # LATIN SMALL LETTER A WITH ACUTE
136=>"{\xE0}<à>", # LATIN SMALL LETTER A WITH GRAVE
137=>"{\xE2}<â>", # LATIN SMALL LETTER A WITH CIRCUMFLEX
138=>"{\xE4}<ä>", # LATIN SMALL LETTER A WITH DIAERESIS
139=>"{\xE3}<ã>", # LATIN SMALL LETTER A WITH TILDE
140=>"{\xE5}<å>", # LATIN SMALL LETTER A WITH RING ABOVE
141=>"{\xE7}<ç>", # LATIN SMALL LETTER C WITH CEDILLA
142=>"{\xE9}<é>", # LATIN SMALL LETTER E WITH ACUTE
143=>"{\xE8}<è>", # LATIN SMALL LETTER E WITH GRAVE
144=>"{\xEA}<ê>", # LATIN SMALL LETTER E WITH CIRCUMFLEX
145=>"{\xEB}<ë>", # LATIN SMALL LETTER E WITH DIAERESIS
146=>"{\xED}<í>", # LATIN SMALL LETTER I WITH ACUTE
147=>"{\xEC}<ì>", # LATIN SMALL LETTER I WITH GRAVE
148=>"{\xEE}<î>", # LATIN SMALL LETTER I WITH CIRCUMFLEX
149=>"{\xEF}<ï>", # LATIN SMALL LETTER I WITH DIAERESIS
150=>"{\xF1}<ñ>", # LATIN SMALL LETTER N WITH TILDE
151=>"{\xF3}<ó>", # LATIN SMALL LETTER O WITH ACUTE
152=>"{\xF2}<ò>", # LATIN SMALL LETTER O WITH GRAVE
153=>"{\xF4}<ô>", # LATIN SMALL LETTER O WITH CIRCUMFLEX
154=>"{\xF6}<ö>", # LATIN SMALL LETTER O WITH DIAERESIS
155=>"{\xF5}<õ>", # LATIN SMALL LETTER O WITH TILDE
156=>"{\xFA}<ú>", # LATIN SMALL LETTER U WITH ACUTE
157=>"{\xF9}<ù>", # LATIN SMALL LETTER U WITH GRAVE
158=>"{\xFB}<û>", # LATIN SMALL LETTER U WITH CIRCUMFLEX
159=>"{\xFC}<ü>", # LATIN SMALL LETTER U WITH DIAERESIS
160=>"<†>", # DAGGER
161=>"{\xB0}<°>", # DEGREE SIGN
162=>"{\xA2}<¢>", # CENT SIGN
163=>"{\xA3}<£>", # POUND SIGN
164=>"{\xA7}<§>", # SECTION SIGN
165=>"<•>", # BULLET
166=>"{\xB6}<¶>", # PILCROW SIGN
167=>"{\xDF}<ß>", # LATIN SMALL LETTER SHARP S
168=>"{\xAE}<®>", # REGISTERED SIGN
169=>"{\xA9}<©>", # COPYRIGHT SIGN
170=>"<™>", # TRADE MARK SIGN
171=>"{\xB4}<´>", # ACUTE ACCENT
172=>"{\xA8}<¨>", # DIAERESIS
173=>"<≠>", # NOT EQUAL TO
174=>"{\xC6}<Æ>", # LATIN CAPITAL LETTER AE
175=>"{\xD8}<Ø>", # LATIN CAPITAL LETTER O WITH STROKE
176=>"<∞>", # INFINITY
177=>"{\xB1}<±>", # PLUS-MINUS SIGN
178=>"<≤>", # LESS-THAN OR EQUAL TO
179=>"<≥>", # GREATER-THAN OR EQUAL TO
180=>"{\xA5}<¥>", # YEN SIGN
181=>"{\xB5}<µ>", # MICRO SIGN
182=>"<∂>", # PARTIAL DIFFERENTIAL
183=>"<∑>", # N-ARY SUMMATION
184=>"<∏>", # N-ARY PRODUCT
185=>"<π>", # GREEK SMALL LETTER PI
186=>"<∫>", # INTEGRAL
187=>"{\xAA}<ª>", # FEMININE ORDINAL INDICATOR (Latin-1 0xAA, like 188 below)
188=>"{\xBA}<º>", # MASCULINE ORDINAL INDICATOR
189=>"<Ω>", # GREEK CAPITAL LETTER OMEGA
190=>"{\xE6}<æ>", # LATIN SMALL LETTER AE
191=>"{\xF8}<ø>", # LATIN SMALL LETTER O WITH STROKE
192=>"{\xBF}<¿>", # INVERTED QUESTION MARK
193=>"{\xA1}<¡>", # INVERTED EXCLAMATION MARK (Latin-1 0xA1)
194=>"{\xAC}<¬>", # NOT SIGN
195=>"<√>", # SQUARE ROOT
196=>"<ƒ>", # LATIN SMALL LETTER F WITH HOOK
197=>"<≈>", # ALMOST EQUAL TO
198=>"<∆>", # INCREMENT
199=>"{\xAB}<«>", # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
200=>"{\xBB}<»>", # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
201=>"<…>", # HORIZONTAL ELLIPSIS
202=>"{\xA0}< >", # NO-BREAK SPACE -- NOTE(review): confirm the character between < > is still a no-break space, not a plain space
203=>"{\xC0}<À>", # LATIN CAPITAL LETTER A WITH GRAVE
204=>"{\xC3}<Ã>", # LATIN CAPITAL LETTER A WITH TILDE
205=>"{\xD5}<Õ>", # LATIN CAPITAL LETTER O WITH TILDE
206=>"<Œ>", # LATIN CAPITAL LIGATURE OE
207=>"<œ>", # LATIN SMALL LIGATURE OE
208=>"<–>", # EN DASH
209=>"<—>", # EM DASH
210=>"<“>", # LEFT DOUBLE QUOTATION MARK
211=>"<”>", # RIGHT DOUBLE QUOTATION MARK
212=>"<‘>", # LEFT SINGLE QUOTATION MARK
213=>"<’>", # RIGHT SINGLE QUOTATION MARK
214=>"{\xF7}<÷>", # DIVISION SIGN
215=>"<◊>", # LOZENGE
216=>"{\xFF}<ÿ>", # LATIN SMALL LETTER Y WITH DIAERESIS
217=>"<Ÿ>", # LATIN CAPITAL LETTER Y WITH DIAERESIS
218=>"<⁄>", # FRACTION SLASH
219=>"<€>", # EURO SIGN
220=>"<‹>", # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
221=>"<›>", # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
222=>"<fi>", # LATIN SMALL LIGATURE FI -- NOTE(review): value holds the two letters "fi", not the ligature; confirm
223=>"<fl>", # LATIN SMALL LIGATURE FL -- NOTE(review): value holds the two letters "fl", not the ligature; confirm
224=>"<‡>", # DOUBLE DAGGER
225=>"{\xB7}<·>", # MIDDLE DOT
226=>"<‚>", # SINGLE LOW-9 QUOTATION MARK
227=>"<„>", # DOUBLE LOW-9 QUOTATION MARK
228=>"<‰>", # PER MILLE SIGN
229=>"{\xC2}<Â>", # LATIN CAPITAL LETTER A WITH CIRCUMFLEX
230=>"{\xCA}<Ê>", # LATIN CAPITAL LETTER E WITH CIRCUMFLEX
231=>"{\xC1}<Á>", # LATIN CAPITAL LETTER A WITH ACUTE
232=>"{\xCB}<Ë>", # LATIN CAPITAL LETTER E WITH DIAERESIS
233=>"{\xC8}<È>", # LATIN CAPITAL LETTER E WITH GRAVE
234=>"{\xCD}<Í>", # LATIN CAPITAL LETTER I WITH ACUTE
235=>"{\xCE}<Î>", # LATIN CAPITAL LETTER I WITH CIRCUMFLEX
236=>"{\xCF}<Ï>", # LATIN CAPITAL LETTER I WITH DIAERESIS
237=>"{\xCC}<Ì>", # LATIN CAPITAL LETTER I WITH GRAVE
238=>"{\xD3}<Ó>", # LATIN CAPITAL LETTER O WITH ACUTE
239=>"{\xD4}<Ô>", # LATIN CAPITAL LETTER O WITH CIRCUMFLEX
240=>"<>", # Apple logo -- NOTE(review): nothing visible between < >; the character may have been lost, confirm
241=>"{\xD2}<Ò>", # LATIN CAPITAL LETTER O WITH GRAVE
242=>"{\xDA}<Ú>", # LATIN CAPITAL LETTER U WITH ACUTE
243=>"{\xDB}<Û>", # LATIN CAPITAL LETTER U WITH CIRCUMFLEX
244=>"{\xD9}<Ù>", # LATIN CAPITAL LETTER U WITH GRAVE
245=>"<ı>", # LATIN SMALL LETTER DOTLESS I
246=>"<ˆ>", # MODIFIER LETTER CIRCUMFLEX ACCENT
247=>"<˜>", # SMALL TILDE
248=>"{\xAF}<¯>", # MACRON
249=>"<˘>", # BREVE
250=>"<˙>", # DOT ABOVE
251=>"<˚>", # RING ABOVE
252=>"{\xB8}<¸>", # CEDILLA
253=>"<˝>", # DOUBLE ACUTE ACCENT
254=>"<˛>", # OGONEK
255=>"<ˇ>", # CARON
);
# ---- sample input: a few words containing 8-bit characters ----
$_ = '
¿Señor?
über
fêté
³Ah!²
';
# ---- end of sample input ----

# Keep every character in the range 0x80-0xFF where it is and put its
# %macToLatin1plus annotation straight after it. The hash element
# interpolates in the replacement text, so no /e modifier is needed.
s{([\x80-\xFF])}{$1$macToLatin1plus{ord($1)}}g;

# Write the annotated text to the currently selected output handle.
print $_;