This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/master by this push:
new 83f1afa TIKA-2454: add OverrideDetector and allow PSTParser to
specify body content type as text or html -- to avoid incorrect auto-detection
of rfc/mbox, etc.
83f1afa is described below
commit 83f1afae3db65af966b13e6cc6dae3872aef630f
Author: tballison <[email protected]>
AuthorDate: Wed Aug 30 16:09:53 2017 -0400
TIKA-2454: add OverrideDetector and allow PSTParser to specify body content
type as text or html -- to avoid incorrect auto-detection of rfc/mbox, etc.
---
CHANGES.txt | 4 ++
.../org/apache/tika/detect/CompositeDetector.java | 7 ++++
.../org/apache/tika/detect/OverrideDetector.java | 41 +++++++++++++++++++++
.../apache/tika/metadata/TikaCoreProperties.java | 3 ++
.../apache/tika/parser/mbox/OutlookPSTParser.java | 27 ++++++++++++--
.../services/org.apache.tika.detect.Detector | 1 +
.../tika/parser/mbox/OutlookPSTParserTest.java | 15 ++++++++
.../test-documents/testPST_variousBodyTypes.pst | Bin 0 -> 271360 bytes
8 files changed, 95 insertions(+), 3 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index a66b15c..9759a5e 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,9 @@
Release 1.17 - ???
+ * Add OverrideDetector and allow PSTParser to specify body content type
+ as text or html -- to avoid incorrect auto-detection of
+ rfc/mbox, etc. (TIKA-2454)
+
* AutoDetectParser throws ZeroByteFileException for zero-byte files after
detection on the file extension (TIKA-2450).
diff --git a/tika-core/src/main/java/org/apache/tika/detect/CompositeDetector.java b/tika-core/src/main/java/org/apache/tika/detect/CompositeDetector.java
index c77a04a..2f6b6d8 100644
--- a/tika-core/src/main/java/org/apache/tika/detect/CompositeDetector.java
+++ b/tika-core/src/main/java/org/apache/tika/detect/CompositeDetector.java
@@ -25,6 +25,7 @@ import java.util.Collections;
import java.util.List;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.mime.MediaTypeRegistry;
@@ -74,6 +75,12 @@ public class CompositeDetector implements Detector {
throws IOException {
MediaType type = MediaType.OCTET_STREAM;
for (Detector detector : getDetectors()) {
+ //short circuit via OverrideDetector
+ //can't rely on ordering because subsequent detector may
+ //change Override's to a specialization of Override's
+ if (detector instanceof OverrideDetector && metadata.get(TikaCoreProperties.CONTENT_TYPE_OVERRIDE) != null) {
+ return detector.detect(input, metadata);
+ }
MediaType detected = detector.detect(input, metadata);
if (registry.isSpecializationOf(detected, type)) {
type = detected;
diff --git a/tika-core/src/main/java/org/apache/tika/detect/OverrideDetector.java b/tika-core/src/main/java/org/apache/tika/detect/OverrideDetector.java
new file mode 100644
index 0000000..318ede8
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/detect/OverrideDetector.java
@@ -0,0 +1,41 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.detect;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+/**
+ * Use this to force a content type detection via the
+ * {@link TikaCoreProperties#CONTENT_TYPE_OVERRIDE} key in the metadata object.
+ */
+public class OverrideDetector implements Detector {
+
+ @Override
+ public MediaType detect(InputStream input, Metadata metadata) throws IOException {
+ String type = metadata.get(TikaCoreProperties.CONTENT_TYPE_OVERRIDE);
+ if (type == null) {
+ return MediaType.OCTET_STREAM;
+ } else {
+ return MediaType.parse(type);
+ }
+ }
+}
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
index 50b751f..e97562d 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
@@ -109,6 +109,9 @@ public interface TikaCoreProperties {
public static final Property CONTENT_TYPE_HINT =
Property.internalText(HttpHeaders.CONTENT_TYPE+"-Hint");
+ Property CONTENT_TYPE_OVERRIDE =
+ Property.internalText(HttpHeaders.CONTENT_TYPE+"-Override");
+
/**
* @see DublinCore#FORMAT
*/
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java
index 3caa53e..43d6862 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java
@@ -32,9 +32,11 @@ import com.pff.PSTFile;
import com.pff.PSTFolder;
import com.pff.PSTMessage;
import com.pff.PSTRecipient;
+import org.apache.poi.ss.formula.functions.T;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Message;
@@ -44,7 +46,12 @@ import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
import org.apache.tika.parser.microsoft.OutlookExtractor;
+import org.apache.tika.parser.rtf.RTFParser;
+import org.apache.tika.parser.txt.TXTParser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.EmbeddedContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
@@ -59,13 +66,14 @@ public class OutlookPSTParser extends AbstractParser {
public static final MediaType MS_OUTLOOK_PST_MIMETYPE =
MediaType.application("vnd.ms-outlook-pst");
private static final Set<MediaType> SUPPORTED_TYPES =
singleton(MS_OUTLOOK_PST_MIMETYPE);
-
+ private static final Parser TEXT_PARSER = new TXTParser();
private static AttributesImpl createAttribute(String attName, String
attValue) {
AttributesImpl attributes = new AttributesImpl();
attributes.addAttribute("", attName, attName, "CDATA", attValue);
return attributes;
}
+
public Set<MediaType> getSupportedTypes(ParseContext context) {
return SUPPORTED_TYPES;
}
@@ -198,8 +206,21 @@ public class OutlookPSTParser extends AbstractParser {
} catch (PSTException e) {
//swallow
}
-
- byte[] mailContent = pstMail.getBody().getBytes(UTF_8);
+ String html = pstMail.getBodyHTML();
+ String bodyType = MediaType.TEXT_PLAIN.toString();
+ byte[] mailContent = null;
+ //try to get the html first
+ if (html != null) {
+ String txt = pstMail.getBody();
+ if (txt != null && html.length() > txt.length()) {
+ mailContent = html.getBytes(UTF_8);
+ bodyType = MediaType.TEXT_HTML.toString();
+ }
+ }
+ if (mailContent == null) {
+ mailContent = pstMail.getBody().getBytes(UTF_8);
+ }
+ mailMetadata.set(TikaCoreProperties.CONTENT_TYPE_OVERRIDE, bodyType);
embeddedExtractor.parseEmbedded(new ByteArrayInputStream(mailContent),
handler, mailMetadata, true);
}
diff --git a/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.detect.Detector b/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.detect.Detector
index e86cfd2..8a3d85f 100644
--- a/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.detect.Detector
+++ b/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.detect.Detector
@@ -13,5 +13,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
+org.apache.tika.detect.OverrideDetector
org.apache.tika.parser.microsoft.POIFSContainerDetector
org.apache.tika.parser.pkg.ZipContainerDetector
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/mbox/OutlookPSTParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/mbox/OutlookPSTParserTest.java
index 26597a3..d15fe8c 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/mbox/OutlookPSTParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/mbox/OutlookPSTParserTest.java
@@ -19,6 +19,7 @@ package org.apache.tika.parser.mbox;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
import java.io.IOException;
import java.io.InputStream;
@@ -136,4 +137,18 @@ public class OutlookPSTParserTest extends TikaTest {
assertEquals("[email protected]",
m6.get(Message.MESSAGE_FROM_EMAIL));
}
+
+ @Test
+ public void testOverrideDetector() throws Exception {
+ List<Metadata> metadataList = getRecursiveMetadata("testPST_variousBodyTypes.pst");
+ assertEquals(5, metadataList.size());//before the fix that prevents the RFC parser, this was 6
+ for (Metadata metadata : metadataList) {
+ for (String v : metadata.getValues("X-Parsed-By")) {
+ if (v.contains("RFC822Parser")) {
+ fail("RFCParser should never be called");
+ }
+ }
+ }
+ //TODO: figure out why the bold markup isn't coming through now that we're processing the html
+ }
}
diff --git a/tika-parsers/src/test/resources/test-documents/testPST_variousBodyTypes.pst b/tika-parsers/src/test/resources/test-documents/testPST_variousBodyTypes.pst
new file mode 100644
index 0000000..5846e05
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testPST_variousBodyTypes.pst differ
--
To stop receiving notification emails like this one, please contact
['"[email protected]" <[email protected]>'].