* Maintain passed-in mime in TXTParser (TIKA-2047).
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/32d9ece8
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/32d9ece8
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/32d9ece8

Branch: refs/heads/2.x
Commit: 32d9ece8d84986de240087a580e094de3f879f3c
Parents: 12b1d43
Author: tballison <[email protected]>
Authored: Wed Sep 21 15:51:02 2016 -0400
Committer: tballison <[email protected]>
Committed: Wed Sep 21 15:51:02 2016 -0400

----------------------------------------------------------------------
 CHANGES.txt                                              |  2 ++
 .../main/java/org/apache/tika/parser/txt/TXTParser.java  | 12 +++++++++++-
 .../java/org/apache/tika/parser/txt/TXTParserTest.java   | 10 ++++++++--
 3 files changed, 21 insertions(+), 3 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/tika/blob/32d9ece8/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index 662217d..46a5894 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -17,6 +17,8 @@ Release 2.0 - ???
 
 Release 1.14 - ???
 
+  * Maintain passed-in mime in TXTParser (TIKA-2047).
+
   * Upgrade to POI 3.15-final (TIKA-2013).
 
   * Upgrade to PDFBox 2.0.3 (TIKA-2051).
http://git-wip-us.apache.org/repos/asf/tika/blob/32d9ece8/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/TXTParser.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/TXTParser.java b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/TXTParser.java index 2b20495..2e7bb19 100644 --- a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/TXTParser.java +++ b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/TXTParser.java @@ -72,8 +72,18 @@ public class TXTParser extends AbstractParser { try (AutoDetectReader reader = new AutoDetectReader( new CloseShieldInputStream(stream), metadata, context.get(ServiceLoader.class, LOADER))) { + //try to get detected content type; could be a subclass of text/plain + //such as vcal, etc. + String incomingMime = metadata.get(Metadata.CONTENT_TYPE); + MediaType mediaType = MediaType.TEXT_PLAIN; + if (incomingMime != null) { + MediaType tmpMediaType = MediaType.parse(incomingMime); + if (tmpMediaType != null) { + mediaType = tmpMediaType; + } + } Charset charset = reader.getCharset(); - MediaType type = new MediaType(MediaType.TEXT_PLAIN, charset); + MediaType type = new MediaType(mediaType, charset); metadata.set(Metadata.CONTENT_TYPE, type.toString()); // deprecated, see TIKA-431 metadata.set(Metadata.CONTENT_ENCODING, charset.name()); http://git-wip-us.apache.org/repos/asf/tika/blob/32d9ece8/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java 
b/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java index 9d9a138..17e5ba1 100644 --- a/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java +++ b/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java @@ -196,7 +196,7 @@ public class TXTParserTest extends TikaTest { parser.parse( new ByteArrayInputStream(test2.getBytes(ISO_8859_1)), new BodyContentHandler(), metadata, new ParseContext()); - assertEquals("text/plain; charset=ISO-8859-15", metadata.get(Metadata.CONTENT_TYPE)); + assertEquals("text/html; charset=ISO-8859-15", metadata.get(Metadata.CONTENT_TYPE)); assertEquals("ISO-8859-15", metadata.get(Metadata.CONTENT_ENCODING)); // deprecated } @@ -268,7 +268,13 @@ public class TXTParserTest extends TikaTest { parser.parse( new ByteArrayInputStream(text.getBytes(UTF_8)), new BodyContentHandler(), r.metadata, new ParseContext()); - assertEquals("text/plain; charset=UTF-8", r.metadata.get(Metadata.CONTENT_TYPE)); + assertEquals("application/binary; charset=UTF-8", r.metadata.get(Metadata.CONTENT_TYPE)); } + //TIKA-2047 + @Test + public void testSubclassingMimeTypesRemain() throws Exception { + XMLResult r = getXML("testVCalendar.vcs"); + assertEquals("text/x-vcalendar; charset=ISO-8859-1", r.metadata.get(Metadata.CONTENT_TYPE)); + } }
