TIKA-2047 -- maintain MIME type info for types that are subtypes of text/plain and are handled by TXTParser
Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/41538121 Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/41538121 Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/41538121 Branch: refs/heads/master Commit: 415381212291e843e9091f43f6db8c432eb02aa9 Parents: cc6f6dc Author: tballison <talli...@mitre.org> Authored: Wed Sep 21 13:36:13 2016 -0400 Committer: tballison <talli...@mitre.org> Committed: Wed Sep 21 13:36:13 2016 -0400 ---------------------------------------------------------------------- CHANGES.txt | 2 ++ .../java/org/apache/tika/parser/txt/TXTParser.java | 12 +++++++++++- .../org/apache/tika/parser/txt/TXTParserTest.java | 14 ++++++++++---- 3 files changed, 23 insertions(+), 5 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/41538121/CHANGES.txt ---------------------------------------------------------------------- diff --git a/CHANGES.txt b/CHANGES.txt index 6597dc9..fc94e70 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,5 +1,7 @@ Release 1.14 - ??? + * Maintain passed-in mime in TXTParser (TIKA-2047). + * Upgrade to POI.3-15 (TIKA-2013). * Upgrade to PDFBox 2.0.3 (TIKA-2051). 
http://git-wip-us.apache.org/repos/asf/tika/blob/41538121/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java index 2b20495..2e7bb19 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java @@ -72,8 +72,18 @@ public class TXTParser extends AbstractParser { try (AutoDetectReader reader = new AutoDetectReader( new CloseShieldInputStream(stream), metadata, context.get(ServiceLoader.class, LOADER))) { + //try to get detected content type; could be a subclass of text/plain + //such as vcal, etc. + String incomingMime = metadata.get(Metadata.CONTENT_TYPE); + MediaType mediaType = MediaType.TEXT_PLAIN; + if (incomingMime != null) { + MediaType tmpMediaType = MediaType.parse(incomingMime); + if (tmpMediaType != null) { + mediaType = tmpMediaType; + } + } Charset charset = reader.getCharset(); - MediaType type = new MediaType(MediaType.TEXT_PLAIN, charset); + MediaType type = new MediaType(mediaType, charset); metadata.set(Metadata.CONTENT_TYPE, type.toString()); // deprecated, see TIKA-431 metadata.set(Metadata.CONTENT_ENCODING, charset.name()); http://git-wip-us.apache.org/repos/asf/tika/blob/41538121/tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java index e6b9fc7..0d31357 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java @@ -18,13 +18,13 @@ package 
org.apache.tika.parser.txt; import static java.nio.charset.StandardCharsets.ISO_8859_1; import static java.nio.charset.StandardCharsets.UTF_8; -import static org.apache.tika.TikaTest.assertContains; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNull; import java.io.ByteArrayInputStream; import java.io.StringWriter; +import org.apache.tika.TikaTest; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.parser.ParseContext; @@ -35,7 +35,7 @@ import org.junit.Test; import org.xml.sax.ContentHandler; import org.xml.sax.helpers.DefaultHandler; -public class TXTParserTest { +public class TXTParserTest extends TikaTest { private Parser parser = new TXTParser(); @@ -196,7 +196,7 @@ public class TXTParserTest { parser.parse( new ByteArrayInputStream(test2.getBytes(ISO_8859_1)), new BodyContentHandler(), metadata, new ParseContext()); - assertEquals("text/plain; charset=ISO-8859-15", metadata.get(Metadata.CONTENT_TYPE)); + assertEquals("text/html; charset=ISO-8859-15", metadata.get(Metadata.CONTENT_TYPE)); assertEquals("ISO-8859-15", metadata.get(Metadata.CONTENT_ENCODING)); // deprecated } @@ -289,7 +289,13 @@ public class TXTParserTest { parser.parse( new ByteArrayInputStream(text.getBytes(UTF_8)), new BodyContentHandler(), metadata, new ParseContext()); - assertEquals("text/plain; charset=UTF-8", metadata.get(Metadata.CONTENT_TYPE)); + assertEquals("application/binary; charset=UTF-8", metadata.get(Metadata.CONTENT_TYPE)); } + //TIKA-2047 + @Test + public void testSubclassingMimeTypesRemain() throws Exception { + XMLResult r = getXML("testVCalendar.vcs"); + assertEquals("text/x-vcalendar; charset=ISO-8859-1", r.metadata.get(Metadata.CONTENT_TYPE)); + } }