This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/master by this push:
     new e0ff3eb  TIKA-2454: don't process the htmlbody.  There could be encoding conflicts.  Fallback to what we were doing...just process text.
e0ff3eb is described below

commit e0ff3ebff559bcdad690498d40898d426c0b2b02
Author: tballison <[email protected]>
AuthorDate: Wed Aug 30 16:21:02 2017 -0400

    TIKA-2454: don't process the htmlbody.  There could be encoding conflicts.  Fallback to what we were doing...just process text.
---
 .../apache/tika/parser/mbox/OutlookPSTParser.java  | 23 ++++++++--------------
 .../tika/parser/mbox/OutlookPSTParserTest.java     |  2 +-
 2 files changed, 9 insertions(+), 16 deletions(-)

diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java
index 43d6862..17df9be 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java
@@ -206,21 +206,14 @@ public class OutlookPSTParser extends AbstractParser {
         } catch (PSTException e) {
             //swallow
         }
-        String html = pstMail.getBodyHTML();
-        String bodyType = MediaType.TEXT_PLAIN.toString();
-        byte[] mailContent = null;
-        //try to get the html first
-        if (html != null) {
-            String txt = pstMail.getBody();
-            if (txt != null && html.length() > txt.length()) {
-                mailContent = html.getBytes(UTF_8);
-                bodyType = MediaType.TEXT_HTML.toString();
-            }
-        }
-        if (mailContent == null) {
-            mailContent = pstMail.getBody().getBytes(UTF_8);
-        }
-        mailMetadata.set(TikaCoreProperties.CONTENT_TYPE_OVERRIDE, bodyType);
+        //we may want to experiment with working with the bodyHTML.
+        //However, because we can't get the raw bytes, we _could_ wind up sending
+        //a UTF-8 byte representation of the html that has a conflicting metaheader
+        //that causes the HTMLParser to get the encoding wrong.  Better if we could get
+        //the underlying bytes from the pstMail object...
+
+        byte[] mailContent = pstMail.getBody().getBytes(UTF_8);
+        mailMetadata.set(TikaCoreProperties.CONTENT_TYPE_OVERRIDE, MediaType.TEXT_PLAIN.toString());
         embeddedExtractor.parseEmbedded(new ByteArrayInputStream(mailContent), handler, mailMetadata, true);
     }
 
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/mbox/OutlookPSTParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/mbox/OutlookPSTParserTest.java
index d15fe8c..793a83f 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/mbox/OutlookPSTParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/mbox/OutlookPSTParserTest.java
@@ -149,6 +149,6 @@ public class OutlookPSTParserTest extends TikaTest {
                 }
             }
         }
-        //TODO: figure out why the bold markup isn't coming through now that we're processing the html
+        //TODO: figure out why the bold markup isn't coming through if we do extract then parse the bodyhtml
     }
 }

-- 
To stop receiving notification emails like this one, please contact
['"[email protected]" <[email protected]>'].

Reply via email to