Hello
We had the same problem of determining the encoding of an XML document
in our company a year ago. Because we couldn't find any code on the
net that served this purpose, we decided to implement a "hack" ourselves.
Because it seems that the situation hasn't changed since then, I decided to
post this code here. I hope it will be useful (at least as a temporary solution).
Bye
Alain Schneble
Realize IT GmbH
CH - 8640 Rapperswil
So here is the hack:
(Part of the getEncodingName method may come from
a version of the Xerces sources, but I'm not 100% sure anymore...)
/**
 * Determines the character encoding of an XML document.
 *
 * <p>The detection works in two steps: the first bytes of the document are
 * inspected to find the "physical" encoding family (byte order mark or the
 * byte pattern of {@code <?xml}); the beginning of the document is then
 * decoded with that encoding and the {@code encoding} pseudo-attribute of the
 * XML declaration is read.
 */
public final class CharsetFetcher
{
    /** Number of leading bytes inspected to guess the physical encoding. */
    private static final int SNIFF_BYTES = 4;

    /** Maximum number of characters decoded while looking for the XML declaration. */
    private static final int PROLOG_CHARS = 128;

    /**
     * Returns the charset an XML document claims to be written in.
     *
     * <p>If the document starts with an XML declaration that carries an
     * {@code encoding} pseudo-attribute (single- or double-quoted), the
     * charset named there is returned. Otherwise the charset detected from
     * the leading bytes is returned (UTF-8 when nothing else is recognised).
     * Only the first {@value #PROLOG_CHARS} characters are examined.
     *
     * @param urlXML location of the XML document
     * @return the declared charset, or the detected physical charset when the
     *         document has no usable encoding declaration
     * @throws java.io.UncheckedIOException if the document cannot be opened or read
     * @throws java.nio.charset.IllegalCharsetNameException if the declared name is not a legal charset name
     * @throws java.nio.charset.UnsupportedCharsetException if the detected or declared
     *         charset is not available in this Java runtime
     */
    public static Charset getEncoding(URL urlXML)
    {
        // The stream is opened once; mark/reset lets us sniff the first bytes
        // and then decode the very same bytes as text.
        try (BufferedInputStream in = new BufferedInputStream(urlXML.openStream()))
        {
            in.mark(SNIFF_BYTES);
            byte[] head = new byte[SNIFF_BYTES];
            int headLength = 0;
            while (headLength < head.length)
            {
                // read() may deliver fewer bytes than requested, so loop until full or EOF
                int n = in.read(head, headLength, head.length - headLength);
                if (n < 0)
                {
                    break;
                }
                headLength += n;
            }
            in.reset();

            Charset physical = Charset.forName(getEncodingName(head, headLength));

            // Not closed separately: closing 'in' releases the underlying stream.
            InputStreamReader reader = new InputStreamReader(in, physical);
            char[] prolog = new char[PROLOG_CHARS];
            int prologLength = 0;
            while (prologLength < prolog.length)
            {
                int n = reader.read(prolog, prologLength, prolog.length - prologLength);
                if (n < 0)
                {
                    break;
                }
                prologLength += n;
            }

            String declared = findDeclaredEncoding(new String(prolog, 0, prologLength));
            return (declared == null) ? physical : Charset.forName(declared);
        }
        catch (java.io.IOException e)
        {
            // The method signature declares no checked exception, so wrap and keep the cause.
            throw new java.io.UncheckedIOException("Cannot read XML prolog from " + urlXML, e);
        }
    }

    /**
     * Extracts the value of the {@code encoding} pseudo-attribute from the XML
     * declaration at the start of {@code prolog}.
     *
     * <p>Keywords are matched case-insensitively so that documents accepted by
     * the earlier, upper-casing implementation keep working.
     *
     * @param prolog the decoded beginning of the document
     * @return the declared encoding name, or {@code null} if there is no XML
     *         declaration or it carries no well-formed encoding value
     */
    private static String findDeclaredEncoding(String prolog)
    {
        // A decoded byte order mark may precede the declaration (e.g. UTF-8 with BOM).
        int start = (prolog.length() > 0 && prolog.charAt(0) == '\uFEFF') ? 1 : 0;
        if (!prolog.regionMatches(true, start, "<?xml", 0, 5))
        {
            return null;
        }

        // Restrict the search to the declaration itself so that the word
        // "encoding" inside the document body is never picked up.
        int end = prolog.indexOf("?>", start);
        String declaration = (end < 0) ? prolog.substring(start) : prolog.substring(start, end);

        String keyword = "encoding";
        int pos = -1;
        for (int i = 0; i + keyword.length() <= declaration.length(); i++)
        {
            if (declaration.regionMatches(true, i, keyword, 0, keyword.length()))
            {
                pos = i + keyword.length();
                break;
            }
        }
        if (pos < 0)
        {
            return null;
        }

        pos = skipWhitespace(declaration, pos);
        if (pos >= declaration.length() || declaration.charAt(pos) != '=')
        {
            return null;
        }
        pos = skipWhitespace(declaration, pos + 1);
        if (pos >= declaration.length())
        {
            return null;
        }

        // XML allows the value to be quoted with either " or '.
        char quote = declaration.charAt(pos);
        if (quote != '"' && quote != '\'')
        {
            return null;
        }
        int close = declaration.indexOf(quote, pos + 1);
        if (close < 0)
        {
            return null;
        }
        return declaration.substring(pos + 1, close);
    }

    /**
     * Returns the index of the first character at or after {@code from} that
     * is not XML white space (space, tab, CR, LF), or {@code s.length()}.
     */
    private static int skipWhitespace(String s, int from)
    {
        int i = from;
        while (i < s.length())
        {
            char c = s.charAt(i);
            if (c != ' ' && c != '\t' && c != '\r' && c != '\n')
            {
                break;
            }
            i++;
        }
        return i;
    }

    /**
     * Returns the Java charset name of the (physical) encoding that could be
     * determined from the byte representation of the first characters of a
     * text file.
     *
     * @param b4    buffer holding the first bytes of the file
     * @param count number of valid bytes in {@code b4} (0 to 4)
     * @return a charset name suitable for decoding the start of the file;
     *         "UTF-8" when nothing more specific is recognised
     * @throws java.nio.charset.UnsupportedCharsetException for UCS-4 in an
     *         unusual octet order, for which no decoder is available
     */
    private static String getEncodingName(byte[] b4, int count)
    {
        if (count < 2)
        {
            return "UTF-8";
        }
        int b0 = b4[0] & 0xFF;
        int b1 = b4[1] & 0xFF;

        if (count >= 4)
        {
            int b2 = b4[2] & 0xFF;
            int b3 = b4[3] & 0xFF;
            // UTF-32 byte order marks; the little-endian one starts like the
            // UTF-16LE mark, so it must be tested first.
            if ((b0 == 0x00 && b1 == 0x00 && b2 == 0xFE && b3 == 0xFF)
                || (b0 == 0xFF && b1 == 0xFE && b2 == 0x00 && b3 == 0x00))
            {
                return "UTF-32";
            }
        }

        // UTF-16 with BOM (either byte order): the "UTF-16" decoder consumes
        // the mark and selects the byte order itself.
        if ((b0 == 0xFE && b1 == 0xFF) || (b0 == 0xFF && b1 == 0xFE))
        {
            return "UTF-16";
        }

        // default to UTF-8 if we don't have enough bytes to make a
        // good determination of the encoding
        if (count < 3)
        {
            return "UTF-8";
        }

        // UTF-8 with a BOM
        int b2 = b4[2] & 0xFF;
        if (b0 == 0xEF && b1 == 0xBB && b2 == 0xBF)
        {
            return "UTF-8";
        }

        if (count < 4)
        {
            return "UTF-8";
        }

        // encodings recognised by the byte pattern of "<" or "<?" without a BOM
        int b3 = b4[3] & 0xFF;
        if (b0 == 0x00 && b1 == 0x00 && b2 == 0x00 && b3 == 0x3C)
        {
            // UCS-4, big endian (1234)
            return "UTF-32BE";
        }
        if (b0 == 0x3C && b1 == 0x00 && b2 == 0x00 && b3 == 0x00)
        {
            // UCS-4, little endian (4321)
            return "UTF-32LE";
        }
        if ((b0 == 0x00 && b1 == 0x00 && b2 == 0x3C && b3 == 0x00)
            || (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x00))
        {
            // UCS-4, unusual octet order (2143 or 3412): fail clearly instead
            // of decoding garbage with an unrelated charset.
            throw new java.nio.charset.UnsupportedCharsetException(
                "ISO-10646-UCS-4 (unusual octet order)");
        }
        if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x3F)
        {
            // UTF-16, big-endian, no BOM (or could turn out to be UCS-2)
            return "UTF-16BE";
        }
        if (b0 == 0x3C && b1 == 0x00 && b2 == 0x3F && b3 == 0x00)
        {
            // UTF-16, little-endian, no BOM (or could turn out to be UCS-2)
            return "UTF-16LE";
        }
        if (b0 == 0x4C && b1 == 0x6F && b2 == 0xA7 && b3 == 0x94)
        {
            // EBCDIC: use CP037 as the concrete code page
            return "CP037";
        }

        // default encoding
        return "UTF-8";
    }
}
+wzf¢+,¦ìo$
ëF÷iÉ®+j`èzĺ·àzw
éiÊzj+zèvç-÷¡ë"¶¥§*.r·µæÞ×r^Hp1Ä,åDjwazZn²¥¥
Xw«zm§ÿì¢êÜyú+éÞ÷h«^ýÚ&â;¬z¹X§X¬µÚ&â;¬z¹b²Û,¢êÜyú+éÞm¦Ïÿ+-²Ê.Ç¢¸ë+-³ùb²Ø~Ýn#ºÇ