Hello, we had the same problem of determining the encoding of an XML document at our company a year ago. Because we couldn't find any code on the net that served this purpose, we decided to implement a "hack" ourselves. Since it seems that the situation hasn't changed in the meantime, I decided to post this code here. I hope it will be useful (at least as a temporary solution).
/*
 * Posted by Alain Schneble, Realize IT GmbH, CH - 8640 Rapperswil.
 *
 * Author's note: it could be that a part of the getEncodingName method comes
 * from a version of the Xerces sources (the author was not 100% sure anymore).
 */
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.nio.charset.Charset;
import java.nio.charset.UnsupportedCharsetException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Determines the character encoding of an XML document.
 *
 * <p>The first bytes of the document are inspected to guess the physical
 * encoding (byte order mark or the byte pattern of {@code "<?xm"}). The prefix
 * is then decoded with that encoding and the {@code encoding} pseudo-attribute
 * of the XML declaration is looked up.
 */
public final class CharsetFetcher {

    /** Number of leading bytes fetched from the document to look for the XML declaration. */
    private static final int PROLOG_BYTES = 512;

    /**
     * Matches {@code encoding="..."} or {@code encoding='...'}; group 2 is the
     * encoding name. Matching is ASCII case-insensitive and therefore does not
     * depend on the default locale.
     */
    private static final Pattern ENCODING_ATTRIBUTE = Pattern.compile(
            "\\bencoding\\s*=\\s*([\"'])([^\"']*)\\1", Pattern.CASE_INSENSITIVE);

    /**
     * Returns the charset named by the XML declaration of the document at the
     * given URL, or the physical encoding detected from its leading bytes when
     * the document has no XML declaration or the declaration names no encoding
     * (UTF-8 if nothing more specific can be detected).
     *
     * <p>The URL is opened once and at most {@value #PROLOG_BYTES} bytes are read.
     *
     * @param urlXML location of the XML document
     * @return the declared charset, or the detected physical charset as a fallback
     * @throws IOException if the document cannot be opened or read
     * @throws java.nio.charset.IllegalCharsetNameException if the declared name is not a legal charset name
     * @throws UnsupportedCharsetException if the declared or detected charset is not
     *         available in this JVM, or the document is UCS-4 in an unusual octet order
     */
    public static Charset getEncoding(URL urlXML) throws IOException {
        byte[] head = new byte[PROLOG_BYTES];
        int count = 0;
        try (InputStream in = urlXML.openStream()) {
            int n;
            // A single read() may deliver fewer bytes than requested, so keep
            // reading until the buffer is full or the stream ends.
            while (count < head.length
                    && (n = in.read(head, count, head.length - count)) != -1) {
                count += n;
            }
        }

        Charset physical = physicalCharset(getEncodingName(head, Math.min(count, 4)));
        String declared = findDeclaredEncoding(new String(head, 0, count, physical));
        return declared == null ? physical : Charset.forName(declared);
    }

    /**
     * Maps the result of {@link #getEncodingName(byte[], int)} to a charset
     * that can decode the document prefix.
     */
    private static Charset physicalCharset(Object[] detected) {
        String name = (String) detected[0];
        if ("ISO-10646-UCS-4".equals(name)) {
            Boolean bigEndian = (Boolean) detected[1];
            if (bigEndian == null) {
                // Octet orders 2143 and 3412 have no matching Java charset.
                throw new UnsupportedCharsetException(
                        "ISO-10646-UCS-4 with unusual octet order");
            }
            name = bigEndian.booleanValue() ? "UTF-32BE" : "UTF-32LE";
        }
        return Charset.forName(name);
    }

    /**
     * Extracts the value of the {@code encoding} pseudo-attribute from the XML
     * declaration at the start of {@code prolog}.
     *
     * @return the encoding name as written in the document, or {@code null} if
     *         the text does not start with an XML declaration or the
     *         declaration has no {@code encoding} pseudo-attribute
     */
    private static String findDeclaredEncoding(String prolog) {
        // The decoders used here keep a byte order mark as U+FEFF; skip it.
        int start = (prolog.length() > 0 && prolog.charAt(0) == '\uFEFF') ? 1 : 0;
        int afterTarget = start + "<?xml".length();
        if (!prolog.regionMatches(true, start, "<?xml", 0, "<?xml".length())
                || afterTarget >= prolog.length()
                || !Character.isWhitespace(prolog.charAt(afterTarget))) {
            return null;
        }
        // Only look inside the declaration so that an "encoding" attribute of
        // a later element is not mistaken for the document encoding.
        int end = prolog.indexOf("?>", afterTarget);
        String declaration = end < 0 ? prolog.substring(start) : prolog.substring(start, end);
        Matcher matcher = ENCODING_ATTRIBUTE.matcher(declaration);
        return matcher.find() ? matcher.group(2) : null;
    }

    /**
     * Returns the (physical) encoding name that could be determined based on
     * the byte representations of the characters in the first four bytes of a
     * text file.
     *
     * @param b4 the leading bytes of the file
     * @param count number of valid bytes in {@code b4} (at most four are inspected)
     * @return a two-element array: the encoding name, and a {@link Boolean}
     *         that is {@code TRUE} for big-endian, {@code FALSE} for
     *         little-endian, or {@code null} when byte order does not apply or
     *         is unknown
     */
    private static Object[] getEncodingName(byte[] b4, int count) {
        if (count < 2) {
            return new Object[] {"UTF-8", null};
        }

        // UTF-16, with BOM
        int b0 = b4[0] & 0xFF;
        int b1 = b4[1] & 0xFF;
        if (b0 == 0xFE && b1 == 0xFF) {
            // UTF-16, big-endian
            return new Object[] {"UTF-16BE", Boolean.TRUE};
        }
        if (b0 == 0xFF && b1 == 0xFE) {
            // UTF-16, little-endian
            return new Object[] {"UTF-16LE", Boolean.FALSE};
        }

        // default to UTF-8 if we don't have enough bytes to make a
        // good determination of the encoding
        if (count < 3) {
            return new Object[] {"UTF-8", null};
        }

        // UTF-8 with a BOM
        int b2 = b4[2] & 0xFF;
        if (b0 == 0xEF && b1 == 0xBB && b2 == 0xBF) {
            return new Object[] {"UTF-8", null};
        }

        // default to UTF-8 if we don't have enough bytes to make a
        // good determination of the encoding
        if (count < 4) {
            return new Object[] {"UTF-8", null};
        }

        // other encodings
        int b3 = b4[3] & 0xFF;
        if (b0 == 0x00 && b1 == 0x00 && b2 == 0x00 && b3 == 0x3C) {
            // UCS-4, big endian (1234)
            return new Object[] {"ISO-10646-UCS-4", Boolean.TRUE};
        }
        if (b0 == 0x3C && b1 == 0x00 && b2 == 0x00 && b3 == 0x00) {
            // UCS-4, little endian (4321)
            return new Object[] {"ISO-10646-UCS-4", Boolean.FALSE};
        }
        if (b0 == 0x00 && b1 == 0x00 && b2 == 0x3C && b3 == 0x00) {
            // UCS-4, unusual octet order (2143)
            // REVISIT: What should this be?
            return new Object[] {"ISO-10646-UCS-4", null};
        }
        if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x00) {
            // UCS-4, unusual octet order (3412)
            // REVISIT: What should this be?
            return new Object[] {"ISO-10646-UCS-4", null};
        }
        if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x3F) {
            // UTF-16, big-endian, no BOM
            // (or could turn out to be UCS-2...
            // REVISIT: What should this be?
            return new Object[] {"UTF-16BE", Boolean.TRUE};
        }
        if (b0 == 0x3C && b1 == 0x00 && b2 == 0x3F && b3 == 0x00) {
            // UTF-16, little-endian, no BOM
            // (or could turn out to be UCS-2...
            return new Object[] {"UTF-16LE", Boolean.FALSE};
        }
        if (b0 == 0x4C && b1 == 0x6F && b2 == 0xA7 && b3 == 0x94) {
            // EBCDIC
            // return CP037 instead of EBCDIC here
            return new Object[] {"CP037", null};
        }

        // default encoding
        return new Object[] {"UTF-8", null};
    }
}