At 3:03 pm +0100 23/1/01, Lars G. Skjellerup wrote:

|   Can somebody please help?
|   
|   I get a file (tab delimited) from a mac Filemaker and I need to get
|   all the "weird" mac chars converted to win32 chars (for saving) and
|   from there to html. The later part is not a big problem, but the
|   first...
|   
|   I have tried to do:
|   
|                   $lin =~ s///g;
|                   $lin =~ s///g;
|                   $lin =~ s///g;
|                   $lin =~ s//2/g;
|                   $lin =~ s//"/g;
|                   $lin =~ s//'/g;

I never use a variable for the line but rely on $_ ; I would only use the "=~" 
construction if I were putting the value of $a into $b. 


                    # Each line read from INFILEHANDLE lands in $_; the
                    # substitutions and the print all act on $_, so no named
                    # line variable is needed.
                    while (defined($_ = <INFILEHANDLE>)) {
                        s{x}{y}g;
                        s{a}{b}g;
                        print OUTFILEHANDLE $_;
                    }

|   
|   But after a while I get a little tired of having to add a new "weird"
|   char every time I discover yet another one.
|   
|   Isn't there somewhere out there a nice person that have a complete
|   list or even better have a piece of code to do it for me ?


I would personally not use windows-1252 in the content-type declaration but state just 
"text/html", period, and use Unicode mapping for the characters that are not in the 
range up to 0x00FF; however, the table below is what you asked for and _does_ demand a 
windows-1252 charset declaration.  Those Mac characters that don't convert are given the 
Unicode value.

Though there are now named entities such as "&ldquo;" for '“', I think (not sure) 
older browsers (say IE4.0) may not understand them, whereas they do understand the hex. 
 Netscape in my experience understands very little about anything.

JD


# Maps every high-bit Mac OS Roman byte (0x80-0xFF) to an HTML hexadecimal
# numeric character reference for the same character.
#
# Every value is a Unicode code point.  The byte values that Windows-1252
# assigns in 0x80-0x9F (for example 0x93 for the left double quotation mark)
# are deliberately NOT used as reference numbers: a numeric character
# reference names a code point in the document character set, and
# U+0080-U+009F are C1 control characters there.  Using the Unicode values
# makes the output independent of whatever charset the page declares.
%macToWin32HTML = (
"\x80"=>"&#xC4;",       #LATIN CAPITAL LETTER A WITH DIAERESIS
"\x81"=>"&#xC5;",       #LATIN CAPITAL LETTER A WITH RING ABOVE
"\x82"=>"&#xC7;",       #LATIN CAPITAL LETTER C WITH CEDILLA
"\x83"=>"&#xC9;",       #LATIN CAPITAL LETTER E WITH ACUTE
"\x84"=>"&#xD1;",       #LATIN CAPITAL LETTER N WITH TILDE
"\x85"=>"&#xD6;",       #LATIN CAPITAL LETTER O WITH DIAERESIS
"\x86"=>"&#xDC;",       #LATIN CAPITAL LETTER U WITH DIAERESIS
"\x87"=>"&#xE1;",       #LATIN SMALL LETTER A WITH ACUTE
"\x88"=>"&#xE0;",       #LATIN SMALL LETTER A WITH GRAVE
"\x89"=>"&#xE2;",       #LATIN SMALL LETTER A WITH CIRCUMFLEX
"\x8A"=>"&#xE4;",       #LATIN SMALL LETTER A WITH DIAERESIS
"\x8B"=>"&#xE3;",       #LATIN SMALL LETTER A WITH TILDE
"\x8C"=>"&#xE5;",       #LATIN SMALL LETTER A WITH RING ABOVE
"\x8D"=>"&#xE7;",       #LATIN SMALL LETTER C WITH CEDILLA
"\x8E"=>"&#xE9;",       #LATIN SMALL LETTER E WITH ACUTE
"\x8F"=>"&#xE8;",       #LATIN SMALL LETTER E WITH GRAVE
"\x90"=>"&#xEA;",       #LATIN SMALL LETTER E WITH CIRCUMFLEX
"\x91"=>"&#xEB;",       #LATIN SMALL LETTER E WITH DIAERESIS
"\x92"=>"&#xED;",       #LATIN SMALL LETTER I WITH ACUTE
"\x93"=>"&#xEC;",       #LATIN SMALL LETTER I WITH GRAVE
"\x94"=>"&#xEE;",       #LATIN SMALL LETTER I WITH CIRCUMFLEX
"\x95"=>"&#xEF;",       #LATIN SMALL LETTER I WITH DIAERESIS
"\x96"=>"&#xF1;",       #LATIN SMALL LETTER N WITH TILDE
"\x97"=>"&#xF3;",       #LATIN SMALL LETTER O WITH ACUTE
"\x98"=>"&#xF2;",       #LATIN SMALL LETTER O WITH GRAVE
"\x99"=>"&#xF4;",       #LATIN SMALL LETTER O WITH CIRCUMFLEX
"\x9A"=>"&#xF6;",       #LATIN SMALL LETTER O WITH DIAERESIS
"\x9B"=>"&#xF5;",       #LATIN SMALL LETTER O WITH TILDE
"\x9C"=>"&#xFA;",       #LATIN SMALL LETTER U WITH ACUTE
"\x9D"=>"&#xF9;",       #LATIN SMALL LETTER U WITH GRAVE
"\x9E"=>"&#xFB;",       #LATIN SMALL LETTER U WITH CIRCUMFLEX
"\x9F"=>"&#xFC;",       #LATIN SMALL LETTER U WITH DIAERESIS
"\xA0"=>"&#x2020;",     #DAGGER
"\xA1"=>"&#xB0;",       #DEGREE SIGN
"\xA2"=>"&#xA2;",       #CENT SIGN
"\xA3"=>"&#xA3;",       #POUND SIGN
"\xA4"=>"&#xA7;",       #SECTION SIGN
"\xA5"=>"&#x2022;",     #BULLET
"\xA6"=>"&#xB6;",       #PILCROW SIGN
"\xA7"=>"&#xDF;",       #LATIN SMALL LETTER SHARP S
"\xA8"=>"&#xAE;",       #REGISTERED SIGN
"\xA9"=>"&#xA9;",       #COPYRIGHT SIGN
"\xAA"=>"&#x2122;",     #TRADE MARK SIGN
"\xAB"=>"&#xB4;",       #ACUTE ACCENT
"\xAC"=>"&#xA8;",       #DIAERESIS
"\xAD"=>"&#x2260;",     #NOT EQUAL TO
"\xAE"=>"&#xC6;",       #LATIN CAPITAL LIGATURE AE
"\xAF"=>"&#xD8;",       #LATIN CAPITAL LETTER O WITH STROKE
"\xB0"=>"&#x221E;",     #INFINITY
"\xB1"=>"&#xB1;",       #PLUS-MINUS SIGN
"\xB2"=>"&#x2264;",     #LESS-THAN OR EQUAL TO
"\xB3"=>"&#x2265;",     #GREATER-THAN OR EQUAL TO
"\xB4"=>"&#xA5;",       #YEN SIGN
"\xB5"=>"&#xB5;",       #MICRO SIGN
"\xB6"=>"&#x2202;",     #PARTIAL DIFFERENTIAL
"\xB7"=>"&#x2211;",     #N-ARY SUMMATION
"\xB8"=>"&#x220F;",     #N-ARY PRODUCT
"\xB9"=>"&#x03C0;",     #GREEK SMALL LETTER PI
"\xBA"=>"&#x222B;",     #INTEGRAL
"\xBB"=>"&#xAA;",       #FEMININE ORDINAL INDICATOR
"\xBC"=>"&#xBA;",       #MASCULINE ORDINAL INDICATOR
"\xBD"=>"&#x2126;",     #OHM SIGN
"\xBE"=>"&#xE6;",       #LATIN SMALL LIGATURE AE
"\xBF"=>"&#xF8;",       #LATIN SMALL LETTER O WITH STROKE
"\xC0"=>"&#xBF;",       #INVERTED QUESTION MARK
"\xC1"=>"&#xA1;",       #INVERTED EXCLAMATION MARK
"\xC2"=>"&#xAC;",       #NOT SIGN
"\xC3"=>"&#x221A;",     #SQUARE ROOT
"\xC4"=>"&#x0192;",     #LATIN SMALL LETTER F WITH HOOK
"\xC5"=>"&#x2248;",     #ALMOST EQUAL TO
"\xC6"=>"&#x2206;",     #INCREMENT
"\xC7"=>"&#xAB;",       #LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
"\xC8"=>"&#xBB;",       #RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
"\xC9"=>"&#x2026;",     #HORIZONTAL ELLIPSIS
"\xCA"=>"&#xA0;",       #NO-BREAK SPACE
"\xCB"=>"&#xC0;",       #LATIN CAPITAL LETTER A WITH GRAVE
"\xCC"=>"&#xC3;",       #LATIN CAPITAL LETTER A WITH TILDE
"\xCD"=>"&#xD5;",       #LATIN CAPITAL LETTER O WITH TILDE
"\xCE"=>"&#x0152;",     #LATIN CAPITAL LIGATURE OE
"\xCF"=>"&#x0153;",     #LATIN SMALL LIGATURE OE
"\xD0"=>"&#x2013;",     #EN DASH
"\xD1"=>"&#x2014;",     #EM DASH
"\xD2"=>"&#x201C;",     #LEFT DOUBLE QUOTATION MARK
"\xD3"=>"&#x201D;",     #RIGHT DOUBLE QUOTATION MARK
"\xD4"=>"&#x2018;",     #LEFT SINGLE QUOTATION MARK
"\xD5"=>"&#x2019;",     #RIGHT SINGLE QUOTATION MARK
"\xD6"=>"&#xF7;",       #DIVISION SIGN
"\xD7"=>"&#x25CA;",     #LOZENGE
"\xD8"=>"&#xFF;",       #LATIN SMALL LETTER Y WITH DIAERESIS
"\xD9"=>"&#x0178;",     #LATIN CAPITAL LETTER Y WITH DIAERESIS
"\xDA"=>"&#x2044;",     #FRACTION SLASH
"\xDB"=>"&#x20AC;",     # EURO SIGN
"\xDC"=>"&#x2039;",     #SINGLE LEFT-POINTING ANGLE QUOTATION MARK
"\xDD"=>"&#x203A;",     #SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
"\xDE"=>"&#xFB01;",     #LATIN SMALL LIGATURE FI
"\xDF"=>"&#xFB02;",     #LATIN SMALL LIGATURE FL
"\xE0"=>"&#x2021;",     #DOUBLE DAGGER
"\xE1"=>"&#xB7;",       #MIDDLE DOT
"\xE2"=>"&#x201A;",     #SINGLE LOW-9 QUOTATION MARK
"\xE3"=>"&#x201E;",     #DOUBLE LOW-9 QUOTATION MARK
"\xE4"=>"&#x2030;",     #PER MILLE SIGN
"\xE5"=>"&#xC2;",       #LATIN CAPITAL LETTER A WITH CIRCUMFLEX
"\xE6"=>"&#xCA;",       #LATIN CAPITAL LETTER E WITH CIRCUMFLEX
"\xE7"=>"&#xC1;",       #LATIN CAPITAL LETTER A WITH ACUTE
"\xE8"=>"&#xCB;",       #LATIN CAPITAL LETTER E WITH DIAERESIS
"\xE9"=>"&#xC8;",       #LATIN CAPITAL LETTER E WITH GRAVE
"\xEA"=>"&#xCD;",       #LATIN CAPITAL LETTER I WITH ACUTE
"\xEB"=>"&#xCE;",       #LATIN CAPITAL LETTER I WITH CIRCUMFLEX
"\xEC"=>"&#xCF;",       #LATIN CAPITAL LETTER I WITH DIAERESIS
"\xED"=>"&#xCC;",       #LATIN CAPITAL LETTER I WITH GRAVE
"\xEE"=>"&#xD3;",       #LATIN CAPITAL LETTER O WITH ACUTE
"\xEF"=>"&#xD4;",       #LATIN CAPITAL LETTER O WITH CIRCUMFLEX
"\xF0"=>"&#xF8FF;",     # Apple logo
"\xF1"=>"&#xD2;",       #LATIN CAPITAL LETTER O WITH GRAVE
"\xF2"=>"&#xDA;",       #LATIN CAPITAL LETTER U WITH ACUTE
"\xF3"=>"&#xDB;",       #LATIN CAPITAL LETTER U WITH CIRCUMFLEX
"\xF4"=>"&#xD9;",       #LATIN CAPITAL LETTER U WITH GRAVE
"\xF5"=>"&#x0131;",     #LATIN SMALL LETTER DOTLESS I
"\xF6"=>"&#x02C6;",     #MODIFIER LETTER CIRCUMFLEX ACCENT
"\xF7"=>"&#x02DC;",     #SMALL TILDE
"\xF8"=>"&#xAF;",       #MACRON
"\xF9"=>"&#x02D8;",     #BREVE
"\xFA"=>"&#x02D9;",     #DOT ABOVE
"\xFB"=>"&#x02DA;",     #RING ABOVE
"\xFC"=>"&#xB8;",       #CEDILLA
"\xFD"=>"&#x02DD;",     #DOUBLE ACUTE ACCENT
"\xFE"=>"&#x02DB;",     #OGONEK
"\xFF"=>"&#x02C7;",     #CARON
);
####
# Demo: convert one Mac OS Roman sample line and print the result.
# The sample bytes are written as \x escapes so the literal cannot be
# silently re-encoded by mail gateways or editors: \xD2 and \xD3 are the
# left/right double quotation marks, \xD5 the right single quotation mark
# (apostrophe), \x90 e-circumflex and \x8E e-acute.
$_ = "\xD2j\xD5ai f\x90t\x8E...\xD3\n";
# Replace every high-bit byte with its entry in %macToWin32HTML.
s~([\x80-\xFF])~$macToWin32HTML{$1}~g;
print;


Reply via email to