This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/master by this push:
     new c8b9b44  TIKA-2626
c8b9b44 is described below

commit c8b9b4409c72ded92d588660274977d5a6fdb539
Author: tballison <talli...@mitre.org>
AuthorDate: Fri Apr 6 13:48:31 2018 -0400

    TIKA-2626
---
 .../tika/parser/mail/MailContentHandler.java       | 25 ++++++++++++++--------
 .../org/apache/tika/parser/mail/RFC822Parser.java  | 18 ++++++++++++++--
 2 files changed, 32 insertions(+), 11 deletions(-)

diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
index 40a3379..fa30ee0 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
@@ -149,14 +149,13 @@ class MailContentHandler implements ContentHandler {
     private final boolean extractAllAlternatives;
     private final EmbeddedDocumentExtractor extractor;
     private final Detector detector;
-
     //this is used to buffer a multipart body that
     //keeps track of multipart/alternative and its children
     private Stack<Part> alternativePartBuffer = new Stack<>();
 
     private Stack<BodyDescriptor> parts = new Stack<>();
 
-    MailContentHandler(XHTMLContentHandler xhtml, Metadata metadata,
+    MailContentHandler(XHTMLContentHandler xhtml, Detector detector, Metadata metadata,
                        ParseContext context, boolean strictParsing, boolean extractAllAlternatives) {
         this.handler = xhtml;
         this.metadata = metadata;
@@ -169,7 +168,7 @@ class MailContentHandler implements ContentHandler {
 
         // Was an EmbeddedDocumentExtractor explicitly supplied?
         this.extractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
-        this.detector = new EmbeddedDocumentUtil(context).getDetector();
+        this.detector = detector;
     }
 
     @Override
@@ -221,7 +220,7 @@ class MailContentHandler implements ContentHandler {
             ByteArrayOutputStream bos = new ByteArrayOutputStream();
             IOUtils.copy(is, bos);
             byte[] bytes = bos.toByteArray();
-            if (isTextOrHtml(submd, bytes)) {
+            if (detectTextOrHtml(submd, bytes)) {
                 handleInlineBodyPart(new BodyContents(submd, bos.toByteArray()));
             } else {
                 //else handle as you would any other embedded content
@@ -237,15 +236,23 @@ class MailContentHandler implements ContentHandler {
         }
     }
 
-    private boolean isTextOrHtml(Metadata submd, byte[] bytes) {
+    private boolean detectTextOrHtml(Metadata submd, byte[] bytes) {
         String mediaTypeString = submd.get(Metadata.CONTENT_TYPE);
-        if (mediaTypeString != null && mediaTypeString.startsWith("text")) {
-            return true;
+        if (mediaTypeString != null) {
+            if (mediaTypeString.startsWith("text")) {
+                return true;
+            } else {
+                return false;
+            }
         }
         try (TikaInputStream tis = TikaInputStream.get(bytes)) {
             MediaType mediaType = detector.detect(tis, submd);
-            if (mediaType != null && mediaType.toString().startsWith("text")) {
-                return true;
+            if (mediaType != null) {
+                //detect only once
+                    submd.set(TikaCoreProperties.CONTENT_TYPE_OVERRIDE, mediaType.toString());
+                if (mediaType.toString().startsWith("text")) {
+                    return true;
+                }
             }
         } catch (IOException e) {
 
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java b/tika-parsers/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java
index 06a094f..ffc4d26 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java
@@ -26,7 +26,9 @@ import org.apache.james.mime4j.message.DefaultBodyDescriptorBuilder;
 import org.apache.james.mime4j.parser.MimeStreamParser;
 import org.apache.james.mime4j.stream.MimeConfig;
 import org.apache.tika.config.Field;
+import org.apache.tika.detect.Detector;
 import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentUtil;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
@@ -54,6 +56,10 @@ public class RFC822Parser extends AbstractParser {
     private static final Set<MediaType> SUPPORTED_TYPES = Collections
             .singleton(MediaType.parse("message/rfc822"));
 
+    //rely on the detector to be thread-safe
+    //built lazily and then reused
+    private Detector detector;
+
     @Field
     private boolean extractAllAlternatives = false;
 
@@ -71,12 +77,20 @@ public class RFC822Parser extends AbstractParser {
                 .build();
 
         config = context.get(MimeConfig.class, config);
-
+        Detector localDetector = context.get(Detector.class);
+        if (localDetector == null) {
+            //lazily load this if necessary
+            if (detector == null) {
+                EmbeddedDocumentUtil embeddedDocumentUtil = new EmbeddedDocumentUtil(context);
+                detector = embeddedDocumentUtil.getDetector();
+            }
+            localDetector = detector;
+        }
         MimeStreamParser parser = new MimeStreamParser(config, null, new DefaultBodyDescriptorBuilder());
         XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
 
         MailContentHandler mch = new MailContentHandler(
-                xhtml, metadata, context, config.isStrictParsing(),
+                xhtml, localDetector, metadata, context, config.isStrictParsing(),
                 extractAllAlternatives);
         parser.setContentHandler(mch);
         parser.setContentDecoding(true);

-- 
To stop receiving notification emails like this one, please contact
talli...@apache.org.

Reply via email to