This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/master by this push:
new e0ff3eb TIKA-2454: don't process the htmlbody. There could be
encoding conflicts. Fallback to what we were doing...just process text.
e0ff3eb is described below
commit e0ff3ebff559bcdad690498d40898d426c0b2b02
Author: tballison <[email protected]>
AuthorDate: Wed Aug 30 16:21:02 2017 -0400
TIKA-2454: don't process the htmlbody. There could be encoding conflicts.
Fallback to what we were doing...just process text.
---
.../apache/tika/parser/mbox/OutlookPSTParser.java | 23 ++++++++--------------
.../tika/parser/mbox/OutlookPSTParserTest.java | 2 +-
2 files changed, 9 insertions(+), 16 deletions(-)
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java
index 43d6862..17df9be 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java
@@ -206,21 +206,14 @@ public class OutlookPSTParser extends AbstractParser {
} catch (PSTException e) {
//swallow
}
- String html = pstMail.getBodyHTML();
- String bodyType = MediaType.TEXT_PLAIN.toString();
- byte[] mailContent = null;
- //try to get the html first
- if (html != null) {
- String txt = pstMail.getBody();
- if (txt != null && html.length() > txt.length()) {
- mailContent = html.getBytes(UTF_8);
- bodyType = MediaType.TEXT_HTML.toString();
- }
- }
- if (mailContent == null) {
- mailContent = pstMail.getBody().getBytes(UTF_8);
- }
- mailMetadata.set(TikaCoreProperties.CONTENT_TYPE_OVERRIDE, bodyType);
+ //we may want to experiment with working with the bodyHTML.
+ //However, because we can't get the raw bytes, we _could_ wind up sending
+ //a UTF-8 byte representation of the html that has a conflicting metaheader
+ //that causes the HTMLParser to get the encoding wrong. Better if we could get
+ //the underlying bytes from the pstMail object...
+
+ byte[] mailContent = pstMail.getBody().getBytes(UTF_8);
+ mailMetadata.set(TikaCoreProperties.CONTENT_TYPE_OVERRIDE, MediaType.TEXT_PLAIN.toString());
embeddedExtractor.parseEmbedded(new ByteArrayInputStream(mailContent),
handler, mailMetadata, true);
}
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/mbox/OutlookPSTParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/mbox/OutlookPSTParserTest.java
index d15fe8c..793a83f 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/mbox/OutlookPSTParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/mbox/OutlookPSTParserTest.java
@@ -149,6 +149,6 @@ public class OutlookPSTParserTest extends TikaTest {
}
}
}
- //TODO: figure out why the bold markup isn't coming through now that we're processing the html
+ //TODO: figure out why the bold markup isn't coming through if we do extract then parse the bodyhtml
}
}
--
To stop receiving notification emails like this one, please contact
['"[email protected]" <[email protected]>'].