Author: awiner
Date: Fri Apr 17 21:56:17 2009
New Revision: 766165

URL: http://svn.apache.org/viewvc?rev=766165&view=rev
Log:
Handle unsupported or invalidly named charsets in Content-Type headers by 
falling back to encoding detection, as if no charset had been specified.

Modified:
    
incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/encoding/EncodingDetector.java
    
incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/http/HttpResponse.java
    
incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/encoding/EncodingDetectorTest.java
    
incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/http/HttpResponseTest.java

Modified: 
incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/encoding/EncodingDetector.java
URL: 
http://svn.apache.org/viewvc/incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/encoding/EncodingDetector.java?rev=766165&r1=766164&r2=766165&view=diff
==============================================================================
--- 
incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/encoding/EncodingDetector.java
 (original)
+++ 
incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/encoding/EncodingDetector.java
 Fri Apr 17 21:56:17 2009
@@ -18,6 +18,8 @@
  */
 package org.apache.shindig.gadgets.encoding;
 
+import java.nio.charset.Charset;
+
 import com.ibm.icu.text.CharsetDetector;
 import com.ibm.icu.text.CharsetMatch;
 
@@ -27,6 +29,8 @@
  * Highly skewed towards common encodings (UTF-8 and Latin-1).
  */
 public class EncodingDetector {
+  private static final Charset UTF_8 = Charset.forName("UTF-8");
+  private static final Charset ISO_8859_1 = Charset.forName("ISO-8859-1");
 
   /**
    * Returns the detected encoding of the given byte array.
@@ -38,20 +42,20 @@
    *     expensive!
    * @return The detected encoding.
    */
-  public static String detectEncoding(byte[] input, boolean 
assume88591IfNotUtf8) {
+  public static Charset detectEncoding(byte[] input, boolean 
assume88591IfNotUtf8) {
     if (looksLikeValidUtf8(input)) {
-      return "UTF-8";
+      return UTF_8;
     }
 
     if (assume88591IfNotUtf8) {
-      return "ISO-8859-1";
+      return ISO_8859_1;
     }
 
     // Fall back to the incredibly slow ICU. It might be better to just skip 
this entirely.
     CharsetDetector detector = new CharsetDetector();
     detector.setText(input);
     CharsetMatch match = detector.detect();
-    return match.getName().toUpperCase();
+    return Charset.forName(match.getName().toUpperCase());
   }
 
   /**

Modified: 
incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/http/HttpResponse.java
URL: 
http://svn.apache.org/viewvc/incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/http/HttpResponse.java?rev=766165&r1=766164&r2=766165&view=diff
==============================================================================
--- 
incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/http/HttpResponse.java
 (original)
+++ 
incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/http/HttpResponse.java
 Fri Apr 17 21:56:17 2009
@@ -122,7 +122,7 @@
   // Default TTL for an entry in the cache that does not have any cache 
control headers.
   static final long DEFAULT_TTL = 5L * 60L * 1000L;
 
-  static final String DEFAULT_ENCODING = "UTF-8";
+  static final Charset DEFAULT_ENCODING = Charset.forName("UTF-8");
 
   @Inject(optional = true) @Named("shindig.cache.http.negativeCacheTtl")
   private static long negativeCacheTtl = DEFAULT_NEGATIVE_CACHE_TTL;
@@ -138,7 +138,7 @@
 
   private transient String responseString;
   private transient long date;
-  private transient String encoding;
+  private transient Charset encoding;
   private transient Map<String, String> metadata;
 
   private int httpStatusCode;
@@ -208,7 +208,7 @@
    * @return The encoding of the response body, if we're able to determine it.
    */
   public String getEncoding() {
-    return encoding;
+    return encoding.name();
   }
 
   /**
@@ -234,12 +234,7 @@
    */
   public String getResponseAsString() {
     if (responseString == null) {
-      Charset charset = encodingToCharset.get(encoding);
-      if (charset == null) {
-        charset = Charset.forName(encoding);
-        encodingToCharset.put(encoding, charset);
-      }
-      responseString = 
charset.decode(ByteBuffer.wrap(responseBytes)).toString();
+      responseString = 
encoding.decode(ByteBuffer.wrap(responseBytes)).toString();
 
       // Strip BOM if present
       if (responseString.length() > 0 && responseString.codePointAt(0) == 
0xFEFF) {
@@ -424,7 +419,7 @@
    *
    * @return The detected encoding or DEFAULT_ENCODING.
    */
-  private static String getAndUpdateEncoding(Multimap<String, String> headers, 
byte[] body) {
+  private static Charset getAndUpdateEncoding(Multimap<String, String> 
headers, byte[] body) {
     if (body == null || body.length == 0) {
       return DEFAULT_ENCODING;
     }
@@ -445,14 +440,19 @@
           if (charset.charAt(0) == '"') {
             charset = charset.substring(1, charset.length() - 1);
           }
-          return charset;
+          
+          try {
+            return charsetForName(charset);
+          } catch (IllegalArgumentException e) {
+            // fall through to detection
+          }
         }
       }
-      String encoding = EncodingDetector.detectEncoding(body, 
fastEncodingDetection);
+      Charset encoding = EncodingDetector.detectEncoding(body, 
fastEncodingDetection);
       // Record the charset in the content-type header so that its value can 
be cached
       // and re-used. This is a BIG performance win.
       values.clear();
-      values.add(contentType + "; charset=" + encoding);
+      values.add(contentType + "; charset=" + encoding.name());
 
       return encoding;
     } else {
@@ -461,6 +461,21 @@
     }
   }
 
+  /**
+   * Cover for Charset.forName() that caches results.
+   * @return the charset
+   * @throws IllegalArgumentException if the encoding is invalid
+   */
+  private static Charset charsetForName(String encoding) {
+    Charset charset = encodingToCharset.get(encoding);
+    if (charset == null) {
+      charset = Charset.forName(encoding);
+      encodingToCharset.put(encoding, charset);
+    }
+    
+    return charset;
+  }
+  
   @Override
   public boolean equals(Object obj) {
     if (obj == this) { return true; }

Modified: 
incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/encoding/EncodingDetectorTest.java
URL: 
http://svn.apache.org/viewvc/incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/encoding/EncodingDetectorTest.java?rev=766165&r1=766164&r2=766165&view=diff
==============================================================================
--- 
incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/encoding/EncodingDetectorTest.java
 (original)
+++ 
incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/encoding/EncodingDetectorTest.java
 Fri Apr 17 21:56:17 2009
@@ -27,7 +27,7 @@
   @Test
   public void asciiAssumesUtf8() throws Exception {
     byte[] data = "Hello, world".getBytes("US-ASCII");
-    assertEquals("UTF-8", EncodingDetector.detectEncoding(data, true));
+    assertEquals("UTF-8", EncodingDetector.detectEncoding(data, true).name());
   }
 
   @Test
@@ -36,14 +36,14 @@
         (byte)0xEF, (byte)0xBB, (byte)0xBF, 'h', 'e', 'l', 'l', 'o'
     };
 
-    assertEquals("UTF-8", EncodingDetector.detectEncoding(data, true));
+    assertEquals("UTF-8", EncodingDetector.detectEncoding(data, true).name());
   }
 
   @Test
   public void assumeLatin1OnInvalidUtf8() throws Exception {
     byte[] data = "\u4F60\u597D".getBytes("BIG5");
 
-    assertEquals("ISO-8859-1", EncodingDetector.detectEncoding(data, true));
+    assertEquals("ISO-8859-1", EncodingDetector.detectEncoding(data, 
true).name());
   }
 
   @Test
@@ -53,7 +53,7 @@
                    
"\u8FBE\u4E0D\u51FA\uFF0C\u6709\u611F\u60C5\u65E0\u6CD5\u503E\u5410")
                    .getBytes("GB18030");
 
-    assertEquals("GB18030", EncodingDetector.detectEncoding(data, false));
+    assertEquals("GB18030", EncodingDetector.detectEncoding(data, 
false).name());
   }
 
   @Test
@@ -63,13 +63,13 @@
                    
"\u8FBE\u4E0D\u51FA\uFF0C\u6709\u611F\u60C5\u65E0\u6CD5\u503E\u5410")
                    .getBytes("UTF-8");
 
-    assertEquals("UTF-8", EncodingDetector.detectEncoding(data, true));
+    assertEquals("UTF-8", EncodingDetector.detectEncoding(data, true).name());
   }
 
   @Test
   public void shortUtf8StringIsUtf8() throws Exception {
     byte[] data = "Games, HQ, Mang\u00E1, Anime e tudo que um bom nerd 
ama".getBytes("UTF-8");
 
-    assertEquals("UTF-8", EncodingDetector.detectEncoding(data, true));
+    assertEquals("UTF-8", EncodingDetector.detectEncoding(data, true).name());
   }
 }

Modified: 
incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/http/HttpResponseTest.java
URL: 
http://svn.apache.org/viewvc/incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/http/HttpResponseTest.java?rev=766165&r1=766164&r2=766165&view=diff
==============================================================================
--- 
incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/http/HttpResponseTest.java
 (original)
+++ 
incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/http/HttpResponseTest.java
 Fri Apr 17 21:56:17 2009
@@ -62,40 +62,55 @@
   }
 
   @Test
-  public void testGetEncoding() throws Exception {
+  public void testEncodingDetectionUtf8WithBom() throws Exception {
+     HttpResponse response = new HttpResponseBuilder()
+         .addHeader("Content-Type", "text/plain; charset=UTF-8")
+         .setResponse(UTF8_DATA)
+         .create();
+    assertEquals(UTF8_STRING, response.getResponseAsString());
+    assertEquals("UTF-8", response.getEncoding());
+  }
+
+  @Test
+  public void testEncodingDetectionLatin1() throws Exception {
+    // Input is a basic latin-1 string with 1 non-UTF8 compatible char.
     HttpResponse response = new HttpResponseBuilder()
-        .addHeader("Content-Type", "text/plain; charset=TEST-CHARACTER-SET")
-        .setResponse(new byte[] {'j', 'u', 'n', 'k'})
+        .addHeader("Content-Type", "text/plain; charset=iso-8859-1")
+        .setResponse(LATIN1_DATA)
         .create();
-    assertEquals("TEST-CHARACTER-SET", response.getEncoding());
+    assertEquals(LATIN1_STRING, response.getResponseAsString());
   }
 
   @Test
-  public void testGetEncodingQuotes() throws Exception {
+  public void testEncodingDetectionLatin1withIncorrectCharset() throws 
Exception {
+    // Input is a basic latin-1 string with 1 non-UTF8 compatible char.
     HttpResponse response = new HttpResponseBuilder()
-        .addHeader("Content-Type", "text/plain; 
charset=\"TEST-CHARACTER-SET\"")
-        .setResponse(new byte[] {'j', 'u', 'n', 'k'})
+        .addHeader("Content-Type", "text/plain; charset=iso-88859-1")
+        .setResponse(LATIN1_DATA)
         .create();
-    assertEquals("TEST-CHARACTER-SET", response.getEncoding());
+    assertEquals(LATIN1_STRING, response.getResponseAsString());
+    assertEquals("ISO-8859-1", response.getEncoding());
   }
 
   @Test
-  public void testEncodingDetectionUtf8WithBom() throws Exception {
+  public void testEncodingDetectionUtf8WithBomAndIncorrectCharset() throws 
Exception {
      HttpResponse response = new HttpResponseBuilder()
-         .addHeader("Content-Type", "text/plain; charset=UTF-8")
+         .addHeader("Content-Type", "text/plain; charset=UTTFF-88")
          .setResponse(UTF8_DATA)
          .create();
     assertEquals(UTF8_STRING, response.getResponseAsString());
+    assertEquals("UTF-8", response.getEncoding());
   }
 
   @Test
-  public void testEncodingDetectionLatin1() throws Exception {
-    // Input is a basic latin-1 string with 1 non-UTF8 compatible char.
-    HttpResponse response = new HttpResponseBuilder()
-        .addHeader("Content-Type", "text/plain; charset=iso-8859-1")
-        .setResponse(LATIN1_DATA)
-        .create();
-    assertEquals(LATIN1_STRING, response.getResponseAsString());
+  public void testEncodingDetectionUtf8WithBomAndInvalidCharset() throws 
Exception {
+     HttpResponse response = new HttpResponseBuilder()
+         // Use a charset that will generate an IllegalCharsetNameException
+         .addHeader("Content-Type", "text/plain; charset=.UTF-8")
+         .setResponse(UTF8_DATA)
+         .create();
+    assertEquals(UTF8_STRING, response.getResponseAsString());
+    assertEquals("UTF-8", response.getEncoding());
   }
 
   @Test
@@ -108,6 +123,15 @@
   }
 
   @Test
+  public void testEncodingDetectionBig5WithQuotes() throws Exception {
+    HttpResponse response = new HttpResponseBuilder()
+        .addHeader("Content-Type", "text/plain; charset=\"BIG5\"")
+        .setResponse(BIG5_DATA)
+        .create();
+    assertEquals(BIG5_STRING, response.getResponseAsString());
+  }
+
+  @Test
   public void testEncodingDetectionUtf8WithBomNoCharsetSpecified() throws 
Exception {
     HttpResponse response = new HttpResponseBuilder()
         .addHeader("Content-Type", "text/plain")
@@ -141,7 +165,7 @@
      HttpResponse response = new HttpResponseBuilder()
         .setResponse(LATIN1_DATA)
         .create();
-    assertEquals(HttpResponse.DEFAULT_ENCODING, response.getEncoding());
+    assertEquals(HttpResponse.DEFAULT_ENCODING.name(), response.getEncoding());
   }
 
   @Test
@@ -150,7 +174,7 @@
         .setResponse(LATIN1_DATA)
         .addHeader("Content-Type", "image/png; charset=iso-8859-1")
         .create();
-    assertEquals(HttpResponse.DEFAULT_ENCODING, 
response.getEncoding().toUpperCase());
+    assertEquals(HttpResponse.DEFAULT_ENCODING.name(), 
response.getEncoding().toUpperCase());
   }
 
   @Test
@@ -159,7 +183,7 @@
         .setResponse(LATIN1_DATA)
         .addHeader("Content-Type", "application/x-shockwave-flash; 
charset=iso-8859-1")
         .create();
-    assertEquals(HttpResponse.DEFAULT_ENCODING, 
response.getEncoding().toUpperCase());
+    assertEquals(HttpResponse.DEFAULT_ENCODING.name(), 
response.getEncoding().toUpperCase());
   }
 
   @Test


Reply via email to