Author: mikemccand
Date: Fri Oct 26 21:06:27 2012
New Revision: 1402665

URL: http://svn.apache.org/viewvc?rev=1402665&view=rev
Log:
TIKA-1011: fix NPE when charset isn't recognized in .mhtml files

Added:
    
tika/trunk/tika-parsers/src/test/resources/test-documents/testUserDefinedCharset.mhtml
   (with props)
Modified:
    tika/trunk/CHANGES.txt
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/Icu4jEncodingDetector.java
    
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java

Modified: tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1402665&r1=1402664&r2=1402665&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Fri Oct 26 21:06:27 2012
@@ -38,6 +38,9 @@ Release 1.3 - Current Development
     key, and TikaCLI prepends the rId (if present) onto the filename
     it extracts (TIKA-997).
 
+  * MHTML: fixed Null charset name exception when a mime part has an
+    unrecognized charset (TIKA-1011).
+
 Release 1.2 - 07/10/2012
 ---------------------------------
 

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/Icu4jEncodingDetector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/Icu4jEncodingDetector.java?rev=1402665&r1=1402664&r2=1402665&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/Icu4jEncodingDetector.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/Icu4jEncodingDetector.java Fri Oct 26 21:06:27 2012
@@ -46,7 +46,12 @@ public class Icu4jEncodingDetector imple
         }
 
         if (incomingCharset != null) {
-            detector.setDeclaredEncoding(CharsetUtils.clean(incomingCharset));
+            String cleaned = CharsetUtils.clean(incomingCharset);
+            if (cleaned != null) {
+                detector.setDeclaredEncoding(cleaned);
+            } else {
+                // TODO: log a warning?
+            }
         }
 
         // TIKA-341 without enabling input filtering (stripping of tags)

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java?rev=1402665&r1=1402664&r2=1402665&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java Fri Oct 26 21:06:27 2012
@@ -801,4 +801,11 @@ public class HtmlParserTest extends Test
         assertEquals("some description", metadata.get("og:description"));
 
     }
+
+    // TIKA-1011
+    public void testUserDefinedCharset() throws Exception {
+        String content = new Tika().parseToString(
+                HtmlParserTest.class.getResourceAsStream("/test-documents/testUserDefinedCharset.mhtml"), new Metadata());
+        assertNotNull(content);
+    }
 }

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testUserDefinedCharset.mhtml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testUserDefinedCharset.mhtml?rev=1402665&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/resources/test-documents/testUserDefinedCharset.mhtml (added)
+++ tika/trunk/tika-parsers/src/test/resources/test-documents/testUserDefinedCharset.mhtml Fri Oct 26 21:06:27 2012
@@ -0,0 +1,21 @@
+From: <Saved by Windows Internet Explorer 8>
+Subject: Index Pages
+Date: Tue, 28 Aug 2012 09:53:28 +0300
+MIME-Version: 1.0
+Content-Type: multipart/related;
+       type="multipart/alternative";
+       boundary="----=_NextPart_000_0000_01CD8502.F991E790"
+X-MimeOLE: Produced By Microsoft MimeOLE V6.00.2900.6157
+
+This is a multi-part message in MIME format.
+
+------=_NextPart_000_0000_01CD8502.F991E790
+Content-Type: multipart/alternative;
+       boundary="----=_NextPart_001_0023_01CD8502.F99DCE70"
+
+
+------=_NextPart_001_0023_01CD8502.F99DCE70
+Content-Type: text/html;
+       charset="x-user-defined"
+Content-Transfer-Encoding: quoted-printable
+

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testUserDefinedCharset.mhtml
------------------------------------------------------------------------------
    svn:eol-style = native


Reply via email to