At 3:03 pm +0100 23/1/01, Lars G. Skjellerup wrote:
| Can somebody please help?
|
| I get a file (tab delimited) from a mac Filemaker and I need to get
| all the "weird" mac chars converted to win32 chars (for saving) and
| from there to html. The latter part is not a big problem, but the
| first...
|
| I have tried to do:
|
| $lin =~ s///g;
| $lin =~ s///g;
| $lin =~ s///g;
| $lin =~ s//2/g;
| $lin =~ s//"/g;
| $lin =~ s//'/g;
I never use a variable for the line but rely on $_; I would only use the "=~"
construction when the substitution has to be applied to a variable other than $_
(for instance after copying the value of $a into $b).
# Read the input one line at a time; the while (<HANDLE>) form puts each line in $_.
while (<INFILEHANDLE>) {
s~x~y~g; # no "=~" binding, so the substitution works on $_; "~" is the delimiter
s~a~b~g; # a second substitution applied to the same line
print OUTFILEHANDLE; # with no argument list, print writes $_ to the handle
}
|
| But after a while I get a little tired of having to add a new "weird"
| char every time I discover yet another one.
|
| Isn't there somewhere out there a nice person who has a complete
| list or, even better, has a piece of code to do it for me?
Personally I would not name windows-1252 in the content-type declaration; I would
state just "text/html", period, and use the Unicode values for the characters that are
not in the range up to 0x00FF. However, the table below is what you asked for and
_does_ demand a windows-1252 charset declaration. Those Mac characters that have no
windows-1252 equivalent are given the Unicode value.
Though there are now named entities such as "&ldquo;" for '“', I think (not sure)
older browsers (say IE 4.0) may not understand them, whereas they do understand the
hex references.
Netscape in my experience understands very little about anything.
JD
# Maps every Mac Roman byte in the range 0x80-0xFF to an HTML hexadecimal
# character reference.
#
# Each value is pure ASCII and names the Unicode code point of the character,
# so the output does not depend on the encoding this file is saved in and it
# renders the same whatever charset the page declares (windows-1252 included).
# Look a byte up with $macToWin32HTML{$byte}; all 128 high bytes have an entry.
our %macToWin32HTML = (
"\x80"=>'&#xC4;', #LATIN CAPITAL LETTER A WITH DIAERESIS
"\x81"=>'&#xC5;', #LATIN CAPITAL LETTER A WITH RING ABOVE
"\x82"=>'&#xC7;', #LATIN CAPITAL LETTER C WITH CEDILLA
"\x83"=>'&#xC9;', #LATIN CAPITAL LETTER E WITH ACUTE
"\x84"=>'&#xD1;', #LATIN CAPITAL LETTER N WITH TILDE
"\x85"=>'&#xD6;', #LATIN CAPITAL LETTER O WITH DIAERESIS
"\x86"=>'&#xDC;', #LATIN CAPITAL LETTER U WITH DIAERESIS
"\x87"=>'&#xE1;', #LATIN SMALL LETTER A WITH ACUTE
"\x88"=>'&#xE0;', #LATIN SMALL LETTER A WITH GRAVE
"\x89"=>'&#xE2;', #LATIN SMALL LETTER A WITH CIRCUMFLEX
"\x8A"=>'&#xE4;', #LATIN SMALL LETTER A WITH DIAERESIS
"\x8B"=>'&#xE3;', #LATIN SMALL LETTER A WITH TILDE
"\x8C"=>'&#xE5;', #LATIN SMALL LETTER A WITH RING ABOVE
"\x8D"=>'&#xE7;', #LATIN SMALL LETTER C WITH CEDILLA
"\x8E"=>'&#xE9;', #LATIN SMALL LETTER E WITH ACUTE
"\x8F"=>'&#xE8;', #LATIN SMALL LETTER E WITH GRAVE
"\x90"=>'&#xEA;', #LATIN SMALL LETTER E WITH CIRCUMFLEX
"\x91"=>'&#xEB;', #LATIN SMALL LETTER E WITH DIAERESIS
"\x92"=>'&#xED;', #LATIN SMALL LETTER I WITH ACUTE
"\x93"=>'&#xEC;', #LATIN SMALL LETTER I WITH GRAVE
"\x94"=>'&#xEE;', #LATIN SMALL LETTER I WITH CIRCUMFLEX
"\x95"=>'&#xEF;', #LATIN SMALL LETTER I WITH DIAERESIS
"\x96"=>'&#xF1;', #LATIN SMALL LETTER N WITH TILDE
"\x97"=>'&#xF3;', #LATIN SMALL LETTER O WITH ACUTE
"\x98"=>'&#xF2;', #LATIN SMALL LETTER O WITH GRAVE
"\x99"=>'&#xF4;', #LATIN SMALL LETTER O WITH CIRCUMFLEX
"\x9A"=>'&#xF6;', #LATIN SMALL LETTER O WITH DIAERESIS
"\x9B"=>'&#xF5;', #LATIN SMALL LETTER O WITH TILDE
"\x9C"=>'&#xFA;', #LATIN SMALL LETTER U WITH ACUTE
"\x9D"=>'&#xF9;', #LATIN SMALL LETTER U WITH GRAVE
"\x9E"=>'&#xFB;', #LATIN SMALL LETTER U WITH CIRCUMFLEX
"\x9F"=>'&#xFC;', #LATIN SMALL LETTER U WITH DIAERESIS
"\xA0"=>'&#x2020;', #DAGGER
"\xA1"=>'&#xB0;', #DEGREE SIGN
"\xA2"=>'&#xA2;', #CENT SIGN
"\xA3"=>'&#xA3;', #POUND SIGN
"\xA4"=>'&#xA7;', #SECTION SIGN
"\xA5"=>'&#x2022;', #BULLET
"\xA6"=>'&#xB6;', #PILCROW SIGN
"\xA7"=>'&#xDF;', #LATIN SMALL LETTER SHARP S
"\xA8"=>'&#xAE;', #REGISTERED SIGN
"\xA9"=>'&#xA9;', #COPYRIGHT SIGN
"\xAA"=>'&#x2122;', #TRADE MARK SIGN
"\xAB"=>'&#xB4;', #ACUTE ACCENT
"\xAC"=>'&#xA8;', #DIAERESIS
"\xAD"=>'&#x2260;', #NOT EQUAL TO
"\xAE"=>'&#xC6;', #LATIN CAPITAL LIGATURE AE
"\xAF"=>'&#xD8;', #LATIN CAPITAL LETTER O WITH STROKE
"\xB0"=>'&#x221E;', #INFINITY
"\xB1"=>'&#xB1;', #PLUS-MINUS SIGN
"\xB2"=>'&#x2264;', #LESS-THAN OR EQUAL TO
"\xB3"=>'&#x2265;', #GREATER-THAN OR EQUAL TO
"\xB4"=>'&#xA5;', #YEN SIGN
"\xB5"=>'&#xB5;', #MICRO SIGN
"\xB6"=>'&#x2202;', #PARTIAL DIFFERENTIAL
"\xB7"=>'&#x2211;', #N-ARY SUMMATION
"\xB8"=>'&#x220F;', #N-ARY PRODUCT
"\xB9"=>'&#x3C0;', #GREEK SMALL LETTER PI
"\xBA"=>'&#x222B;', #INTEGRAL
"\xBB"=>'&#xAA;', #FEMININE ORDINAL INDICATOR
"\xBC"=>'&#xBA;', #MASCULINE ORDINAL INDICATOR
"\xBD"=>'&#x2126;', #OHM SIGN
"\xBE"=>'&#xE6;', #LATIN SMALL LIGATURE AE
"\xBF"=>'&#xF8;', #LATIN SMALL LETTER O WITH STROKE
"\xC0"=>'&#xBF;', #INVERTED QUESTION MARK
"\xC1"=>'&#xA1;', #INVERTED EXCLAMATION MARK
"\xC2"=>'&#xAC;', #NOT SIGN
"\xC3"=>'&#x221A;', #SQUARE ROOT
"\xC4"=>'&#x192;', #LATIN SMALL LETTER F WITH HOOK
"\xC5"=>'&#x2248;', #ALMOST EQUAL TO
"\xC6"=>'&#x2206;', #INCREMENT
"\xC7"=>'&#xAB;', #LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
"\xC8"=>'&#xBB;', #RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
"\xC9"=>'&#x2026;', #HORIZONTAL ELLIPSIS
"\xCA"=>'&#xA0;', #NO-BREAK SPACE
"\xCB"=>'&#xC0;', #LATIN CAPITAL LETTER A WITH GRAVE
"\xCC"=>'&#xC3;', #LATIN CAPITAL LETTER A WITH TILDE
"\xCD"=>'&#xD5;', #LATIN CAPITAL LETTER O WITH TILDE
"\xCE"=>'&#x152;', #LATIN CAPITAL LIGATURE OE
"\xCF"=>'&#x153;', #LATIN SMALL LIGATURE OE
"\xD0"=>'&#x2013;', #EN DASH
"\xD1"=>'&#x2014;', #EM DASH
"\xD2"=>'&#x201C;', #LEFT DOUBLE QUOTATION MARK
"\xD3"=>'&#x201D;', #RIGHT DOUBLE QUOTATION MARK
"\xD4"=>'&#x2018;', #LEFT SINGLE QUOTATION MARK
"\xD5"=>'&#x2019;', #RIGHT SINGLE QUOTATION MARK
"\xD6"=>'&#xF7;', #DIVISION SIGN
"\xD7"=>'&#x25CA;', #LOZENGE
"\xD8"=>'&#xFF;', #LATIN SMALL LETTER Y WITH DIAERESIS
"\xD9"=>'&#x178;', #LATIN CAPITAL LETTER Y WITH DIAERESIS
"\xDA"=>'&#x2044;', #FRACTION SLASH
"\xDB"=>'&#x20AC;', # EURO SIGN
"\xDC"=>'&#x2039;', #SINGLE LEFT-POINTING ANGLE QUOTATION MARK
"\xDD"=>'&#x203A;', #SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
"\xDE"=>'&#xFB01;', #LATIN SMALL LIGATURE FI
"\xDF"=>'&#xFB02;', #LATIN SMALL LIGATURE FL
"\xE0"=>'&#x2021;', #DOUBLE DAGGER
"\xE1"=>'&#xB7;', #MIDDLE DOT
"\xE2"=>'&#x201A;', #SINGLE LOW-9 QUOTATION MARK
"\xE3"=>'&#x201E;', #DOUBLE LOW-9 QUOTATION MARK
"\xE4"=>'&#x2030;', #PER MILLE SIGN
"\xE5"=>'&#xC2;', #LATIN CAPITAL LETTER A WITH CIRCUMFLEX
"\xE6"=>'&#xCA;', #LATIN CAPITAL LETTER E WITH CIRCUMFLEX
"\xE7"=>'&#xC1;', #LATIN CAPITAL LETTER A WITH ACUTE
"\xE8"=>'&#xCB;', #LATIN CAPITAL LETTER E WITH DIAERESIS
"\xE9"=>'&#xC8;', #LATIN CAPITAL LETTER E WITH GRAVE
"\xEA"=>'&#xCD;', #LATIN CAPITAL LETTER I WITH ACUTE
"\xEB"=>'&#xCE;', #LATIN CAPITAL LETTER I WITH CIRCUMFLEX
"\xEC"=>'&#xCF;', #LATIN CAPITAL LETTER I WITH DIAERESIS
"\xED"=>'&#xCC;', #LATIN CAPITAL LETTER I WITH GRAVE
"\xEE"=>'&#xD3;', #LATIN CAPITAL LETTER O WITH ACUTE
"\xEF"=>'&#xD4;', #LATIN CAPITAL LETTER O WITH CIRCUMFLEX
"\xF0"=>'&#xF8FF;', # Apple logo (private-use code point; only Apple fonts draw it)
"\xF1"=>'&#xD2;', #LATIN CAPITAL LETTER O WITH GRAVE
"\xF2"=>'&#xDA;', #LATIN CAPITAL LETTER U WITH ACUTE
"\xF3"=>'&#xDB;', #LATIN CAPITAL LETTER U WITH CIRCUMFLEX
"\xF4"=>'&#xD9;', #LATIN CAPITAL LETTER U WITH GRAVE
"\xF5"=>'&#x131;', #LATIN SMALL LETTER DOTLESS I
"\xF6"=>'&#x2C6;', #MODIFIER LETTER CIRCUMFLEX ACCENT
"\xF7"=>'&#x2DC;', #SMALL TILDE
"\xF8"=>'&#xAF;', #MACRON
"\xF9"=>'&#x2D8;', #BREVE
"\xFA"=>'&#x2D9;', #DOT ABOVE
"\xFB"=>'&#x2DA;', #RING ABOVE
"\xFC"=>'&#xB8;', #CEDILLA
"\xFD"=>'&#x2DD;', #DOUBLE ACUTE ACCENT
"\xFE"=>'&#x2DB;', #OGONEK
"\xFF"=>'&#x2C7;', #CARON
);
####
# Demo: convert one sample line and print it.
# The sample is spelled with \x escapes so that it really holds Mac Roman
# bytes (left double quote, j, right single quote, "ai f", e-circumflex, t,
# e-acute, three dots, right double quote) however this file is saved.
$_ = "\xD2j\xD5ai f\x90t\x8E...\xD3\n";
# Replace every byte in 0x80-0xFF with its entry in %macToWin32HTML.
s~([\x80-\xFF])~$macToWin32HTML{$1}~g;
print;