Author: mikemccand
Date: Fri Oct 26 21:06:27 2012
New Revision: 1402665
URL: http://svn.apache.org/viewvc?rev=1402665&view=rev
Log:
TIKA-1011: fix NPE when charset isn't recognized in .mhtml files
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/testUserDefinedCharset.mhtml
(with props)
Modified:
tika/trunk/CHANGES.txt
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/Icu4jEncodingDetector.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
Modified: tika/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1402665&r1=1402664&r2=1402665&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Fri Oct 26 21:06:27 2012
@@ -38,6 +38,9 @@ Release 1.3 - Current Development
key, and TikaCLI prepends the rId (if present) onto the filename
it extracts (TIKA-997).
+ * MHTML: fixed Null charset name exception when a mime part has an
+ unrecognized charset (TIKA-1011).
+
Release 1.2 - 07/10/2012
---------------------------------
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/Icu4jEncodingDetector.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/Icu4jEncodingDetector.java?rev=1402665&r1=1402664&r2=1402665&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/Icu4jEncodingDetector.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/Icu4jEncodingDetector.java Fri Oct 26 21:06:27 2012
@@ -46,7 +46,12 @@ public class Icu4jEncodingDetector imple
}
if (incomingCharset != null) {
- detector.setDeclaredEncoding(CharsetUtils.clean(incomingCharset));
+ String cleaned = CharsetUtils.clean(incomingCharset);
+ if (cleaned != null) {
+ detector.setDeclaredEncoding(cleaned);
+ } else {
+ // TODO: log a warning?
+ }
}
// TIKA-341 without enabling input filtering (stripping of tags)
Modified:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java?rev=1402665&r1=1402664&r2=1402665&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java Fri Oct 26 21:06:27 2012
@@ -801,4 +801,11 @@ public class HtmlParserTest extends Test
assertEquals("some description", metadata.get("og:description"));
}
+
+ // TIKA-1011
+ public void testUserDefinedCharset() throws Exception {
+ String content = new Tika().parseToString(
+
HtmlParserTest.class.getResourceAsStream("/test-documents/testUserDefinedCharset.mhtml"),
new Metadata());
+ assertNotNull(content);
+ }
}
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/testUserDefinedCharset.mhtml
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testUserDefinedCharset.mhtml?rev=1402665&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/resources/test-documents/testUserDefinedCharset.mhtml (added)
+++ tika/trunk/tika-parsers/src/test/resources/test-documents/testUserDefinedCharset.mhtml Fri Oct 26 21:06:27 2012
@@ -0,0 +1,21 @@
+From: <Saved by Windows Internet Explorer 8>
+Subject: Index Pages
+Date: Tue, 28 Aug 2012 09:53:28 +0300
+MIME-Version: 1.0
+Content-Type: multipart/related;
+ type="multipart/alternative";
+ boundary="----=_NextPart_000_0000_01CD8502.F991E790"
+X-MimeOLE: Produced By Microsoft MimeOLE V6.00.2900.6157
+
+This is a multi-part message in MIME format.
+
+------=_NextPart_000_0000_01CD8502.F991E790
+Content-Type: multipart/alternative;
+ boundary="----=_NextPart_001_0023_01CD8502.F99DCE70"
+
+
+------=_NextPart_001_0023_01CD8502.F99DCE70
+Content-Type: text/html;
+ charset="x-user-defined"
+Content-Transfer-Encoding: quoted-printable
+
Propchange:
tika/trunk/tika-parsers/src/test/resources/test-documents/testUserDefinedCharset.mhtml
------------------------------------------------------------------------------
svn:eol-style = native