Author: awiner
Date: Fri Apr 17 21:56:17 2009
New Revision: 766165
URL: http://svn.apache.org/viewvc?rev=766165&view=rev
Log:
Handle unsupported or invalidly named charsets in Content-Type headers by
falling back as if the encoding were unspecified.
Modified:
incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/encoding/EncodingDetector.java
incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/http/HttpResponse.java
incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/encoding/EncodingDetectorTest.java
incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/http/HttpResponseTest.java
Modified:
incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/encoding/EncodingDetector.java
URL:
http://svn.apache.org/viewvc/incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/encoding/EncodingDetector.java?rev=766165&r1=766164&r2=766165&view=diff
==============================================================================
--- incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/encoding/EncodingDetector.java (original)
+++ incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/encoding/EncodingDetector.java Fri Apr 17 21:56:17 2009
@@ -18,6 +18,8 @@
*/
package org.apache.shindig.gadgets.encoding;
+import java.nio.charset.Charset;
+
import com.ibm.icu.text.CharsetDetector;
import com.ibm.icu.text.CharsetMatch;
@@ -27,6 +29,8 @@
* Highly skewed towards common encodings (UTF-8 and Latin-1).
*/
public class EncodingDetector {
+ private static final Charset UTF_8 = Charset.forName("UTF-8");
+ private static final Charset ISO_8859_1 = Charset.forName("ISO-8859-1");
/**
* Returns the detected encoding of the given byte array.
@@ -38,20 +42,20 @@
* expensive!
* @return The detected encoding.
*/
- public static String detectEncoding(byte[] input, boolean assume88591IfNotUtf8) {
+ public static Charset detectEncoding(byte[] input, boolean assume88591IfNotUtf8) {
if (looksLikeValidUtf8(input)) {
- return "UTF-8";
+ return UTF_8;
}
if (assume88591IfNotUtf8) {
- return "ISO-8859-1";
+ return ISO_8859_1;
}
// Fall back to the incredibly slow ICU. It might be better to just skip this entirely.
CharsetDetector detector = new CharsetDetector();
detector.setText(input);
CharsetMatch match = detector.detect();
- return match.getName().toUpperCase();
+ return Charset.forName(match.getName().toUpperCase());
}
/**
Modified:
incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/http/HttpResponse.java
URL:
http://svn.apache.org/viewvc/incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/http/HttpResponse.java?rev=766165&r1=766164&r2=766165&view=diff
==============================================================================
--- incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/http/HttpResponse.java (original)
+++ incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/http/HttpResponse.java Fri Apr 17 21:56:17 2009
@@ -122,7 +122,7 @@
// Default TTL for an entry in the cache that does not have any cache control headers.
static final long DEFAULT_TTL = 5L * 60L * 1000L;
- static final String DEFAULT_ENCODING = "UTF-8";
+ static final Charset DEFAULT_ENCODING = Charset.forName("UTF-8");
@Inject(optional = true) @Named("shindig.cache.http.negativeCacheTtl")
private static long negativeCacheTtl = DEFAULT_NEGATIVE_CACHE_TTL;
@@ -138,7 +138,7 @@
private transient String responseString;
private transient long date;
- private transient String encoding;
+ private transient Charset encoding;
private transient Map<String, String> metadata;
private int httpStatusCode;
@@ -208,7 +208,7 @@
* @return The encoding of the response body, if we're able to determine it.
*/
public String getEncoding() {
- return encoding;
+ return encoding.name();
}
/**
@@ -234,12 +234,7 @@
*/
public String getResponseAsString() {
if (responseString == null) {
- Charset charset = encodingToCharset.get(encoding);
- if (charset == null) {
- charset = Charset.forName(encoding);
- encodingToCharset.put(encoding, charset);
- }
- responseString = charset.decode(ByteBuffer.wrap(responseBytes)).toString();
+ responseString = encoding.decode(ByteBuffer.wrap(responseBytes)).toString();
// Strip BOM if present
if (responseString.length() > 0 && responseString.codePointAt(0) == 0xFEFF) {
@@ -424,7 +419,7 @@
*
* @return The detected encoding or DEFAULT_ENCODING.
*/
- private static String getAndUpdateEncoding(Multimap<String, String> headers, byte[] body) {
+ private static Charset getAndUpdateEncoding(Multimap<String, String> headers, byte[] body) {
if (body == null || body.length == 0) {
return DEFAULT_ENCODING;
}
@@ -445,14 +440,19 @@
if (charset.charAt(0) == '"') {
charset = charset.substring(1, charset.length() - 1);
}
- return charset;
+
+ try {
+ return charsetForName(charset);
+ } catch (IllegalArgumentException e) {
+ // fall through to detection
+ }
}
}
- String encoding = EncodingDetector.detectEncoding(body, fastEncodingDetection);
+ Charset encoding = EncodingDetector.detectEncoding(body, fastEncodingDetection);
// Record the charset in the content-type header so that its value can be cached
// and re-used. This is a BIG performance win.
values.clear();
- values.add(contentType + "; charset=" + encoding);
+ values.add(contentType + "; charset=" + encoding.name());
return encoding;
} else {
@@ -461,6 +461,21 @@
}
}
+ /**
+ * Cover for Charset.forName() that caches results.
+ * @return the charset
+ * @throws IllegalArgumentException if the encoding is invalid
+ */
+ private static Charset charsetForName(String encoding) {
+ Charset charset = encodingToCharset.get(encoding);
+ if (charset == null) {
+ charset = Charset.forName(encoding);
+ encodingToCharset.put(encoding, charset);
+ }
+
+ return charset;
+ }
+
@Override
public boolean equals(Object obj) {
if (obj == this) { return true; }
Modified:
incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/encoding/EncodingDetectorTest.java
URL:
http://svn.apache.org/viewvc/incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/encoding/EncodingDetectorTest.java?rev=766165&r1=766164&r2=766165&view=diff
==============================================================================
--- incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/encoding/EncodingDetectorTest.java (original)
+++ incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/encoding/EncodingDetectorTest.java Fri Apr 17 21:56:17 2009
@@ -27,7 +27,7 @@
@Test
public void asciiAssumesUtf8() throws Exception {
byte[] data = "Hello, world".getBytes("US-ASCII");
- assertEquals("UTF-8", EncodingDetector.detectEncoding(data, true));
+ assertEquals("UTF-8", EncodingDetector.detectEncoding(data, true).name());
}
@Test
@@ -36,14 +36,14 @@
(byte)0xEF, (byte)0xBB, (byte)0xBF, 'h', 'e', 'l', 'l', 'o'
};
- assertEquals("UTF-8", EncodingDetector.detectEncoding(data, true));
+ assertEquals("UTF-8", EncodingDetector.detectEncoding(data, true).name());
}
@Test
public void assumeLatin1OnInvalidUtf8() throws Exception {
byte[] data = "\u4F60\u597D".getBytes("BIG5");
- assertEquals("ISO-8859-1", EncodingDetector.detectEncoding(data, true));
+ assertEquals("ISO-8859-1", EncodingDetector.detectEncoding(data, true).name());
}
@Test
@@ -53,7 +53,7 @@
"\u8FBE\u4E0D\u51FA\uFF0C\u6709\u611F\u60C5\u65E0\u6CD5\u503E\u5410")
.getBytes("GB18030");
- assertEquals("GB18030", EncodingDetector.detectEncoding(data, false));
+ assertEquals("GB18030", EncodingDetector.detectEncoding(data, false).name());
}
@Test
@@ -63,13 +63,13 @@
"\u8FBE\u4E0D\u51FA\uFF0C\u6709\u611F\u60C5\u65E0\u6CD5\u503E\u5410")
.getBytes("UTF-8");
- assertEquals("UTF-8", EncodingDetector.detectEncoding(data, true));
+ assertEquals("UTF-8", EncodingDetector.detectEncoding(data, true).name());
}
@Test
public void shortUtf8StringIsUtf8() throws Exception {
byte[] data = "Games, HQ, Mang\u00E1, Anime e tudo que um bom nerd ama".getBytes("UTF-8");
- assertEquals("UTF-8", EncodingDetector.detectEncoding(data, true));
+ assertEquals("UTF-8", EncodingDetector.detectEncoding(data, true).name());
}
}
Modified:
incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/http/HttpResponseTest.java
URL:
http://svn.apache.org/viewvc/incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/http/HttpResponseTest.java?rev=766165&r1=766164&r2=766165&view=diff
==============================================================================
--- incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/http/HttpResponseTest.java (original)
+++ incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/http/HttpResponseTest.java Fri Apr 17 21:56:17 2009
@@ -62,40 +62,55 @@
}
@Test
- public void testGetEncoding() throws Exception {
+ public void testEncodingDetectionUtf8WithBom() throws Exception {
+ HttpResponse response = new HttpResponseBuilder()
+ .addHeader("Content-Type", "text/plain; charset=UTF-8")
+ .setResponse(UTF8_DATA)
+ .create();
+ assertEquals(UTF8_STRING, response.getResponseAsString());
+ assertEquals("UTF-8", response.getEncoding());
+ }
+
+ @Test
+ public void testEncodingDetectionLatin1() throws Exception {
+ // Input is a basic latin-1 string with 1 non-UTF8 compatible char.
HttpResponse response = new HttpResponseBuilder()
- .addHeader("Content-Type", "text/plain; charset=TEST-CHARACTER-SET")
- .setResponse(new byte[] {'j', 'u', 'n', 'k'})
+ .addHeader("Content-Type", "text/plain; charset=iso-8859-1")
+ .setResponse(LATIN1_DATA)
.create();
- assertEquals("TEST-CHARACTER-SET", response.getEncoding());
+ assertEquals(LATIN1_STRING, response.getResponseAsString());
}
@Test
- public void testGetEncodingQuotes() throws Exception {
+ public void testEncodingDetectionLatin1withIncorrectCharset() throws Exception {
+ // Input is a basic latin-1 string with 1 non-UTF8 compatible char.
HttpResponse response = new HttpResponseBuilder()
- .addHeader("Content-Type", "text/plain; charset=\"TEST-CHARACTER-SET\"")
- .setResponse(new byte[] {'j', 'u', 'n', 'k'})
+ .addHeader("Content-Type", "text/plain; charset=iso-88859-1")
+ .setResponse(LATIN1_DATA)
.create();
- assertEquals("TEST-CHARACTER-SET", response.getEncoding());
+ assertEquals(LATIN1_STRING, response.getResponseAsString());
+ assertEquals("ISO-8859-1", response.getEncoding());
}
@Test
- public void testEncodingDetectionUtf8WithBom() throws Exception {
+ public void testEncodingDetectionUtf8WithBomAndIncorrectCharset() throws Exception {
HttpResponse response = new HttpResponseBuilder()
- .addHeader("Content-Type", "text/plain; charset=UTF-8")
+ .addHeader("Content-Type", "text/plain; charset=UTTFF-88")
.setResponse(UTF8_DATA)
.create();
assertEquals(UTF8_STRING, response.getResponseAsString());
+ assertEquals("UTF-8", response.getEncoding());
}
@Test
- public void testEncodingDetectionLatin1() throws Exception {
- // Input is a basic latin-1 string with 1 non-UTF8 compatible char.
- HttpResponse response = new HttpResponseBuilder()
- .addHeader("Content-Type", "text/plain; charset=iso-8859-1")
- .setResponse(LATIN1_DATA)
- .create();
- assertEquals(LATIN1_STRING, response.getResponseAsString());
+ public void testEncodingDetectionUtf8WithBomAndInvalidCharset() throws Exception {
+ HttpResponse response = new HttpResponseBuilder()
+ // Use a charset that will generate an IllegalCharsetNameException
+ .addHeader("Content-Type", "text/plain; charset=.UTF-8")
+ .setResponse(UTF8_DATA)
+ .create();
+ assertEquals(UTF8_STRING, response.getResponseAsString());
+ assertEquals("UTF-8", response.getEncoding());
}
@Test
@@ -108,6 +123,15 @@
}
@Test
+ public void testEncodingDetectionBig5WithQuotes() throws Exception {
+ HttpResponse response = new HttpResponseBuilder()
+ .addHeader("Content-Type", "text/plain; charset=\"BIG5\"")
+ .setResponse(BIG5_DATA)
+ .create();
+ assertEquals(BIG5_STRING, response.getResponseAsString());
+ }
+
+ @Test
public void testEncodingDetectionUtf8WithBomNoCharsetSpecified() throws Exception {
HttpResponse response = new HttpResponseBuilder()
.addHeader("Content-Type", "text/plain")
@@ -141,7 +165,7 @@
HttpResponse response = new HttpResponseBuilder()
.setResponse(LATIN1_DATA)
.create();
- assertEquals(HttpResponse.DEFAULT_ENCODING, response.getEncoding());
+ assertEquals(HttpResponse.DEFAULT_ENCODING.name(), response.getEncoding());
}
@Test
@@ -150,7 +174,7 @@
.setResponse(LATIN1_DATA)
.addHeader("Content-Type", "image/png; charset=iso-8859-1")
.create();
- assertEquals(HttpResponse.DEFAULT_ENCODING, response.getEncoding().toUpperCase());
+ assertEquals(HttpResponse.DEFAULT_ENCODING.name(), response.getEncoding().toUpperCase());
}
@Test
@@ -159,7 +183,7 @@
.setResponse(LATIN1_DATA)
.addHeader("Content-Type", "application/x-shockwave-flash; charset=iso-8859-1")
.create();
- assertEquals(HttpResponse.DEFAULT_ENCODING, response.getEncoding().toUpperCase());
+ assertEquals(HttpResponse.DEFAULT_ENCODING.name(), response.getEncoding().toUpperCase());
}
@Test