TIKA-2047 -- maintain mime info for mimes that are subtypes of text/plain
handled by TXTParser


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/41538121
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/41538121
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/41538121

Branch: refs/heads/master
Commit: 415381212291e843e9091f43f6db8c432eb02aa9
Parents: cc6f6dc
Author: tballison <talli...@mitre.org>
Authored: Wed Sep 21 13:36:13 2016 -0400
Committer: tballison <talli...@mitre.org>
Committed: Wed Sep 21 13:36:13 2016 -0400

----------------------------------------------------------------------
 CHANGES.txt                                           |  2 ++
 .../java/org/apache/tika/parser/txt/TXTParser.java    | 12 +++++++++++-
 .../org/apache/tika/parser/txt/TXTParserTest.java     | 14 ++++++++++----
 3 files changed, 23 insertions(+), 5 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/41538121/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index 6597dc9..fc94e70 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,7 @@
 Release 1.14 - ???
 
+  * Maintain passed-in mime in TXTParser (TIKA-2047).
+
   * Upgrade to POI.3-15 (TIKA-2013).
 
   * Upgrade to PDFBox 2.0.3 (TIKA-2051).

http://git-wip-us.apache.org/repos/asf/tika/blob/41538121/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java
index 2b20495..2e7bb19 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java
@@ -72,8 +72,18 @@ public class TXTParser extends AbstractParser {
         try (AutoDetectReader reader = new AutoDetectReader(
                 new CloseShieldInputStream(stream), metadata,
                 context.get(ServiceLoader.class, LOADER))) {
+            //try to get detected content type; could be a subclass of text/plain
+            //such as vcal, etc.
+            String incomingMime = metadata.get(Metadata.CONTENT_TYPE);
+            MediaType mediaType = MediaType.TEXT_PLAIN;
+            if (incomingMime != null) {
+                MediaType tmpMediaType = MediaType.parse(incomingMime);
+                if (tmpMediaType != null) {
+                    mediaType = tmpMediaType;
+                }
+            }
             Charset charset = reader.getCharset();
-            MediaType type = new MediaType(MediaType.TEXT_PLAIN, charset);
+            MediaType type = new MediaType(mediaType, charset);
             metadata.set(Metadata.CONTENT_TYPE, type.toString());
             // deprecated, see TIKA-431
             metadata.set(Metadata.CONTENT_ENCODING, charset.name());

http://git-wip-us.apache.org/repos/asf/tika/blob/41538121/tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
index e6b9fc7..0d31357 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
@@ -18,13 +18,13 @@ package org.apache.tika.parser.txt;
 
 import static java.nio.charset.StandardCharsets.ISO_8859_1;
 import static java.nio.charset.StandardCharsets.UTF_8;
-import static org.apache.tika.TikaTest.assertContains;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertNull;
 
 import java.io.ByteArrayInputStream;
 import java.io.StringWriter;
 
+import org.apache.tika.TikaTest;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.parser.ParseContext;
@@ -35,7 +35,7 @@ import org.junit.Test;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.helpers.DefaultHandler;
 
-public class TXTParserTest {
+public class TXTParserTest extends TikaTest {
 
     private Parser parser = new TXTParser();
 
@@ -196,7 +196,7 @@ public class TXTParserTest {
         parser.parse(
                 new ByteArrayInputStream(test2.getBytes(ISO_8859_1)),
                 new BodyContentHandler(), metadata, new ParseContext());
-        assertEquals("text/plain; charset=ISO-8859-15", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("text/html; charset=ISO-8859-15", metadata.get(Metadata.CONTENT_TYPE));
         assertEquals("ISO-8859-15", metadata.get(Metadata.CONTENT_ENCODING)); // deprecated
     }
 
@@ -289,7 +289,13 @@ public class TXTParserTest {
         parser.parse(
                 new ByteArrayInputStream(text.getBytes(UTF_8)),
                 new BodyContentHandler(), metadata, new ParseContext());
-        assertEquals("text/plain; charset=UTF-8", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("application/binary; charset=UTF-8", metadata.get(Metadata.CONTENT_TYPE));
     }
 
+    //TIKA-2047
+    @Test
+    public void testSubclassingMimeTypesRemain() throws Exception {
+        XMLResult r = getXML("testVCalendar.vcs");
+        assertEquals("text/x-vcalendar; charset=ISO-8859-1", r.metadata.get(Metadata.CONTENT_TYPE));
+    }
 }

Reply via email to