http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/IdentityHtmlMapper.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/IdentityHtmlMapper.java b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/IdentityHtmlMapper.java index 45f0388..da046aa 100644 --- a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/IdentityHtmlMapper.java +++ b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/IdentityHtmlMapper.java @@ -1,43 +1,43 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.parser.html; - -import java.util.Locale; - -/** - * Alternative HTML mapping rules that pass the input HTML as-is without any - * modifications. 
- * - * @since Apache Tika 0.8 - */ -public class IdentityHtmlMapper implements HtmlMapper { - - public static final HtmlMapper INSTANCE = new IdentityHtmlMapper(); - - public boolean isDiscardElement(String name) { - return false; - } - - public String mapSafeAttribute(String elementName, String attributeName) { - return attributeName.toLowerCase(Locale.ENGLISH); - } - - public String mapSafeElement(String name) { - return name.toLowerCase(Locale.ENGLISH); - } - -} +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.html; + +import java.util.Locale; + +/** + * Alternative HTML mapping rules that pass the input HTML as-is without any + * modifications. + * + * @since Apache Tika 0.8 + */ +public class IdentityHtmlMapper implements HtmlMapper { + + public static final HtmlMapper INSTANCE = new IdentityHtmlMapper(); + + public boolean isDiscardElement(String name) { + return false; + } + + public String mapSafeAttribute(String elementName, String attributeName) { + return attributeName.toLowerCase(Locale.ENGLISH); + } + + public String mapSafeElement(String name) { + return name.toLowerCase(Locale.ENGLISH); + } + +}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/XHTMLDowngradeHandler.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/XHTMLDowngradeHandler.java b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/XHTMLDowngradeHandler.java index 336ae75..221a87a 100644 --- a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/XHTMLDowngradeHandler.java +++ b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/XHTMLDowngradeHandler.java @@ -1,78 +1,78 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tika.parser.html; - -import javax.xml.XMLConstants; -import java.util.Locale; - -import org.apache.tika.sax.ContentHandlerDecorator; -import org.xml.sax.Attributes; -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; -import org.xml.sax.helpers.AttributesImpl; - -/** - * Content handler decorator that downgrades XHTML elements to - * old-style HTML elements before passing them on to the decorated - * content handler. This downgrading consists of dropping all namespaces - * (and namespaced attributes) and uppercasing all element names. - * Used by the {@link HtmlParser} to make all incoming HTML look the same. - */ -class XHTMLDowngradeHandler extends ContentHandlerDecorator { - - public XHTMLDowngradeHandler(ContentHandler handler) { - super(handler); - } - - @Override - public void startElement( - String uri, String localName, String name, Attributes atts) - throws SAXException { - String upper = localName.toUpperCase(Locale.ENGLISH); - - AttributesImpl attributes = new AttributesImpl(); - for (int i = 0; i < atts.getLength(); i++) { - String auri = atts.getURI(i); - String local = atts.getLocalName(i); - String qname = atts.getQName(i); - if (XMLConstants.NULL_NS_URI.equals(auri) - && !local.equals(XMLConstants.XMLNS_ATTRIBUTE) - && !qname.startsWith(XMLConstants.XMLNS_ATTRIBUTE + ":")) { - attributes.addAttribute( - auri, local, qname, atts.getType(i), atts.getValue(i)); - } - } - - super.startElement(XMLConstants.NULL_NS_URI, upper, upper, attributes); - } - - @Override - public void endElement(String uri, String localName, String name) - throws SAXException { - String upper = localName.toUpperCase(Locale.ENGLISH); - super.endElement(XMLConstants.NULL_NS_URI, upper, upper); - } - - @Override - public void startPrefixMapping(String prefix, String uri) { - } - - @Override - public void endPrefixMapping(String prefix) { - } - -} +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor 
license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.html; + +import javax.xml.XMLConstants; +import java.util.Locale; + +import org.apache.tika.sax.ContentHandlerDecorator; +import org.xml.sax.Attributes; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.AttributesImpl; + +/** + * Content handler decorator that downgrades XHTML elements to + * old-style HTML elements before passing them on to the decorated + * content handler. This downgrading consists of dropping all namespaces + * (and namespaced attributes) and uppercasing all element names. + * Used by the {@link HtmlParser} to make all incoming HTML look the same. 
+ */ +class XHTMLDowngradeHandler extends ContentHandlerDecorator { + + public XHTMLDowngradeHandler(ContentHandler handler) { + super(handler); + } + + @Override + public void startElement( + String uri, String localName, String name, Attributes atts) + throws SAXException { + String upper = localName.toUpperCase(Locale.ENGLISH); + + AttributesImpl attributes = new AttributesImpl(); + for (int i = 0; i < atts.getLength(); i++) { + String auri = atts.getURI(i); + String local = atts.getLocalName(i); + String qname = atts.getQName(i); + if (XMLConstants.NULL_NS_URI.equals(auri) + && !local.equals(XMLConstants.XMLNS_ATTRIBUTE) + && !qname.startsWith(XMLConstants.XMLNS_ATTRIBUTE + ":")) { + attributes.addAttribute( + auri, local, qname, atts.getType(i), atts.getValue(i)); + } + } + + super.startElement(XMLConstants.NULL_NS_URI, upper, upper, attributes); + } + + @Override + public void endElement(String uri, String localName, String name) + throws SAXException { + String upper = localName.toUpperCase(Locale.ENGLISH); + super.endElement(XMLConstants.NULL_NS_URI, upper, upper); + } + + @Override + public void startPrefixMapping(String prefix, String uri) { + } + + @Override + public void endPrefixMapping(String prefix) { + } + +} http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java index 9740eff..2c8942e 100644 --- a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java +++ b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java @@ -1,376 +1,376 @@ -/* 
- * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.parser.mail; - -import java.io.IOException; -import java.io.InputStream; -import java.text.DateFormat; -import java.text.DateFormatSymbols; -import java.text.ParseException; -import java.text.SimpleDateFormat; -import java.util.Date; -import java.util.Locale; -import java.util.TimeZone; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -import org.apache.james.mime4j.MimeException; -import org.apache.james.mime4j.codec.DecodeMonitor; -import org.apache.james.mime4j.codec.DecoderUtil; -import org.apache.james.mime4j.dom.address.Address; -import org.apache.james.mime4j.dom.address.AddressList; -import org.apache.james.mime4j.dom.address.Mailbox; -import org.apache.james.mime4j.dom.address.MailboxList; -import org.apache.james.mime4j.dom.field.AddressListField; -import org.apache.james.mime4j.dom.field.DateTimeField; -import org.apache.james.mime4j.dom.field.MailboxListField; -import org.apache.james.mime4j.dom.field.ParsedField; -import org.apache.james.mime4j.dom.field.UnstructuredField; -import org.apache.james.mime4j.field.LenientFieldParser; -import org.apache.james.mime4j.parser.ContentHandler; -import 
org.apache.james.mime4j.stream.BodyDescriptor; -import org.apache.james.mime4j.stream.Field; -import org.apache.tika.config.TikaConfig; -import org.apache.tika.extractor.EmbeddedDocumentExtractor; -import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.metadata.TikaCoreProperties; -import org.apache.tika.parser.AutoDetectParser; -import org.apache.tika.parser.ParseContext; -import org.apache.tika.parser.Parser; -import org.apache.tika.sax.XHTMLContentHandler; -import org.xml.sax.SAXException; - -import static org.apache.tika.utils.DateUtils.MIDDAY; -import static org.apache.tika.utils.DateUtils.UTC; - -/** - * Bridge between mime4j's content handler and the generic Sax content handler - * used by Tika. See - * http://james.apache.org/mime4j/apidocs/org/apache/james/mime4j/parser/ContentHandler.html - */ -class MailContentHandler implements ContentHandler { - - //TIKA-1970 Mac Mail's format - private static final Pattern GENERAL_TIME_ZONE_NO_MINUTES_PATTERN = - Pattern.compile("(?:UTC|GMT)([+-])(\\d?\\d)\\Z"); - - //find a time ending in am/pm without a space: 10:30am and - //use this pattern to insert space: 10:30 am - private static final Pattern AM_PM = Pattern.compile("(?i)(\\d)([ap]m)\\b"); - - private static final DateFormat[] ALTERNATE_DATE_FORMATS = new DateFormat[] { - //note that the string is "cleaned" before processing: - //1) condense multiple whitespace to single space - //2) trim() - //3) strip out commas - //4) insert space before am/pm - - //May 16 2016 1:32am - createDateFormat("MMM dd yy hh:mm a", null), - - //this is a standard pattern handled by mime4j; - //but mime4j fails with leading whitespace - createDateFormat("EEE d MMM yy HH:mm:ss Z", UTC), - - createDateFormat("EEE d MMM yy HH:mm:ss z", UTC), - - createDateFormat("EEE d MMM yy HH:mm:ss", null),// no timezone - - createDateFormat("EEEEE MMM d yy hh:mm a", null),// Sunday, May 15 2016 1:32 PM - - //16 May 2016 
at 09:30:32 GMT+1 (Mac Mail TIKA-1970) - createDateFormat("d MMM yy 'at' HH:mm:ss z", UTC), // UTC/Zulu - - createDateFormat("yy-MM-dd HH:mm:ss", null), - - createDateFormat("MM/dd/yy hh:mm a", null, false), - - //now dates without times - createDateFormat("MMM d yy", MIDDAY, false), - createDateFormat("EEE d MMM yy", MIDDAY, false), - createDateFormat("d MMM yy", MIDDAY, false), - createDateFormat("yy/MM/dd", MIDDAY, false), - createDateFormat("MM/dd/yy", MIDDAY, false) - }; - - private static DateFormat createDateFormat(String format, TimeZone timezone) { - return createDateFormat(format, timezone, true); - } - - private static DateFormat createDateFormat(String format, TimeZone timezone, boolean isLenient) { - SimpleDateFormat sdf = - new SimpleDateFormat(format, new DateFormatSymbols(Locale.US)); - if (timezone != null) { - sdf.setTimeZone(timezone); - } - sdf.setLenient(isLenient); - return sdf; - } - - private boolean strictParsing = false; - - private XHTMLContentHandler handler; - private Metadata metadata; - private EmbeddedDocumentExtractor extractor; - - private boolean inPart = false; - - MailContentHandler(XHTMLContentHandler xhtml, Metadata metadata, ParseContext context, boolean strictParsing) { - this.handler = xhtml; - this.metadata = metadata; - this.strictParsing = strictParsing; - - // Fetch / Build an EmbeddedDocumentExtractor with which - // to handle/process the parts/attachments - - // Was an EmbeddedDocumentExtractor explicitly supplied? - this.extractor = context.get(EmbeddedDocumentExtractor.class); - - // If there's no EmbeddedDocumentExtractor, then try using a normal parser - // This will ensure that the contents are made available to the user, so - // the see the text, but without fine-grained control/extraction - // (This also maintains backward compatibility with older versions!) 
- if (this.extractor == null) { - // If the user gave a parser, use that, if not the default - Parser parser = context.get(AutoDetectParser.class); - if (parser == null) { - parser = context.get(Parser.class); - } - if (parser == null) { - TikaConfig tikaConfig = context.get(TikaConfig.class); - if (tikaConfig == null) { - tikaConfig = TikaConfig.getDefaultConfig(); - } - parser = new AutoDetectParser(tikaConfig.getParser()); - } - ParseContext ctx = new ParseContext(); - ctx.set(Parser.class, parser); - extractor = new ParsingEmbeddedDocumentExtractor(ctx); - } - } - - public void body(BodyDescriptor body, InputStream is) throws MimeException, - IOException { - // use a different metadata object - // in order to specify the mime type of the - // sub part without damaging the main metadata - - Metadata submd = new Metadata(); - submd.set(Metadata.CONTENT_TYPE, body.getMimeType()); - submd.set(Metadata.CONTENT_ENCODING, body.getCharset()); - - try { - if (extractor.shouldParseEmbedded(submd)) { - extractor.parseEmbedded(is, handler, submd, false); - } - } catch (SAXException e) { - throw new MimeException(e); - } - } - - public void endBodyPart() throws MimeException { - try { - handler.endElement("p"); - handler.endElement("div"); - } catch (SAXException e) { - throw new MimeException(e); - } - } - - public void endHeader() throws MimeException { - } - - public void startMessage() throws MimeException { - try { - handler.startDocument(); - } catch (SAXException e) { - throw new MimeException(e); - } - } - - public void endMessage() throws MimeException { - try { - handler.endDocument(); - } catch (SAXException e) { - throw new MimeException(e); - } - } - - public void endMultipart() throws MimeException { - inPart = false; - } - - public void epilogue(InputStream is) throws MimeException, IOException { - } - - /** - * Header for the whole message or its parts - * - * @see <a href="http://james.apache.org/mime4j/apidocs/org/apache/james/mime4j/parser/"> - * 
http://james.apache.org/mime4j/apidocs/org/apache/james/mime4j/parser/</a> - * Field.html - */ - public void field(Field field) throws MimeException { - // inPart indicates whether these metadata correspond to the - // whole message or its parts - if (inPart) { - return; - } - - try { - String fieldname = field.getName(); - ParsedField parsedField = LenientFieldParser.getParser().parse( - field, DecodeMonitor.SILENT); - if (fieldname.equalsIgnoreCase("From")) { - MailboxListField fromField = (MailboxListField) parsedField; - MailboxList mailboxList = fromField.getMailboxList(); - if (fromField.isValidField() && mailboxList != null) { - for (Address address : mailboxList) { - String from = getDisplayString(address); - metadata.add(Metadata.MESSAGE_FROM, from); - metadata.add(TikaCoreProperties.CREATOR, from); - } - } else { - String from = stripOutFieldPrefix(field, "From:"); - if (from.startsWith("<")) { - from = from.substring(1); - } - if (from.endsWith(">")) { - from = from.substring(0, from.length() - 1); - } - metadata.add(Metadata.MESSAGE_FROM, from); - metadata.add(TikaCoreProperties.CREATOR, from); - } - } else if (fieldname.equalsIgnoreCase("Subject")) { - metadata.set(TikaCoreProperties.TRANSITION_SUBJECT_TO_DC_TITLE, - ((UnstructuredField) parsedField).getValue()); - } else if (fieldname.equalsIgnoreCase("To")) { - processAddressList(parsedField, "To:", Metadata.MESSAGE_TO); - } else if (fieldname.equalsIgnoreCase("CC")) { - processAddressList(parsedField, "Cc:", Metadata.MESSAGE_CC); - } else if (fieldname.equalsIgnoreCase("BCC")) { - processAddressList(parsedField, "Bcc:", Metadata.MESSAGE_BCC); - } else if (fieldname.equalsIgnoreCase("Date")) { - DateTimeField dateField = (DateTimeField) parsedField; - Date date = dateField.getDate(); - if (date == null) { - date = tryOtherDateFormats(field.getBody()); - } - metadata.set(TikaCoreProperties.CREATED, date); - } - } catch (RuntimeException me) { - if (strictParsing) { - throw me; - } - } - } - - private 
static synchronized Date tryOtherDateFormats(String text) { - if (text == null) { - return null; - } - //strip out additional spaces and trim - text = text.replaceAll("\\s+", " ").trim(); - - //strip out commas - text = text.replaceAll(",", ""); - Matcher matcher = GENERAL_TIME_ZONE_NO_MINUTES_PATTERN.matcher(text); - if (matcher.find()) { - text = matcher.replaceFirst("GMT$1$2:00"); - } - - matcher = AM_PM.matcher(text); - if (matcher.find()) { - text = matcher.replaceFirst("$1 $2"); - } - - for (DateFormat format : ALTERNATE_DATE_FORMATS) { - try { - return format.parse(text); - } catch (ParseException e) { - } - } - return null; - } - - private void processAddressList(ParsedField field, String addressListType, - String metadataField) throws MimeException { - AddressListField toField = (AddressListField) field; - if (toField.isValidField()) { - AddressList addressList = toField.getAddressList(); - for (Address address : addressList) { - metadata.add(metadataField, getDisplayString(address)); - } - } else { - String to = stripOutFieldPrefix(field, - addressListType); - for (String eachTo : to.split(",")) { - metadata.add(metadataField, eachTo.trim()); - } - } - } - - private String getDisplayString(Address address) { - if (address instanceof Mailbox) { - Mailbox mailbox = (Mailbox) address; - String name = mailbox.getName(); - if (name != null && name.length() > 0) { - name = DecoderUtil.decodeEncodedWords(name, DecodeMonitor.SILENT); - return name + " <" + mailbox.getAddress() + ">"; - } else { - return mailbox.getAddress(); - } - } else { - return address.toString(); - } - } - - public void preamble(InputStream is) throws MimeException, IOException { - } - - public void raw(InputStream is) throws MimeException, IOException { - } - - public void startBodyPart() throws MimeException { - try { - handler.startElement("div", "class", "email-entry"); - handler.startElement("p"); - } catch (SAXException e) { - throw new MimeException(e); - } - } - - public void 
startHeader() throws MimeException { - // TODO Auto-generated method stub - - } - - public void startMultipart(BodyDescriptor descr) throws MimeException { - inPart = true; - } - - private String stripOutFieldPrefix(Field field, String fieldname) { - String temp = field.getRaw().toString(); - int loc = fieldname.length(); - while (temp.charAt(loc) == ' ') { - loc++; - } - return temp.substring(loc); - } - -} +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.mail; + +import java.io.IOException; +import java.io.InputStream; +import java.text.DateFormat; +import java.text.DateFormatSymbols; +import java.text.ParseException; +import java.text.SimpleDateFormat; +import java.util.Date; +import java.util.Locale; +import java.util.TimeZone; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.james.mime4j.MimeException; +import org.apache.james.mime4j.codec.DecodeMonitor; +import org.apache.james.mime4j.codec.DecoderUtil; +import org.apache.james.mime4j.dom.address.Address; +import org.apache.james.mime4j.dom.address.AddressList; +import org.apache.james.mime4j.dom.address.Mailbox; +import org.apache.james.mime4j.dom.address.MailboxList; +import org.apache.james.mime4j.dom.field.AddressListField; +import org.apache.james.mime4j.dom.field.DateTimeField; +import org.apache.james.mime4j.dom.field.MailboxListField; +import org.apache.james.mime4j.dom.field.ParsedField; +import org.apache.james.mime4j.dom.field.UnstructuredField; +import org.apache.james.mime4j.field.LenientFieldParser; +import org.apache.james.mime4j.parser.ContentHandler; +import org.apache.james.mime4j.stream.BodyDescriptor; +import org.apache.james.mime4j.stream.Field; +import org.apache.tika.config.TikaConfig; +import org.apache.tika.extractor.EmbeddedDocumentExtractor; +import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.parser.AutoDetectParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.SAXException; + +import static org.apache.tika.utils.DateUtils.MIDDAY; +import static org.apache.tika.utils.DateUtils.UTC; + +/** + * Bridge between mime4j's content handler and the generic Sax content handler + * used by Tika. 
See + * http://james.apache.org/mime4j/apidocs/org/apache/james/mime4j/parser/ContentHandler.html + */ +class MailContentHandler implements ContentHandler { + + //TIKA-1970 Mac Mail's format + private static final Pattern GENERAL_TIME_ZONE_NO_MINUTES_PATTERN = + Pattern.compile("(?:UTC|GMT)([+-])(\\d?\\d)\\Z"); + + //find a time ending in am/pm without a space: 10:30am and + //use this pattern to insert space: 10:30 am + private static final Pattern AM_PM = Pattern.compile("(?i)(\\d)([ap]m)\\b"); + + private static final DateFormat[] ALTERNATE_DATE_FORMATS = new DateFormat[] { + //note that the string is "cleaned" before processing: + //1) condense multiple whitespace to single space + //2) trim() + //3) strip out commas + //4) insert space before am/pm + + //May 16 2016 1:32am + createDateFormat("MMM dd yy hh:mm a", null), + + //this is a standard pattern handled by mime4j; + //but mime4j fails with leading whitespace + createDateFormat("EEE d MMM yy HH:mm:ss Z", UTC), + + createDateFormat("EEE d MMM yy HH:mm:ss z", UTC), + + createDateFormat("EEE d MMM yy HH:mm:ss", null),// no timezone + + createDateFormat("EEEEE MMM d yy hh:mm a", null),// Sunday, May 15 2016 1:32 PM + + //16 May 2016 at 09:30:32 GMT+1 (Mac Mail TIKA-1970) + createDateFormat("d MMM yy 'at' HH:mm:ss z", UTC), // UTC/Zulu + + createDateFormat("yy-MM-dd HH:mm:ss", null), + + createDateFormat("MM/dd/yy hh:mm a", null, false), + + //now dates without times + createDateFormat("MMM d yy", MIDDAY, false), + createDateFormat("EEE d MMM yy", MIDDAY, false), + createDateFormat("d MMM yy", MIDDAY, false), + createDateFormat("yy/MM/dd", MIDDAY, false), + createDateFormat("MM/dd/yy", MIDDAY, false) + }; + + private static DateFormat createDateFormat(String format, TimeZone timezone) { + return createDateFormat(format, timezone, true); + } + + private static DateFormat createDateFormat(String format, TimeZone timezone, boolean isLenient) { + SimpleDateFormat sdf = + new SimpleDateFormat(format, new 
DateFormatSymbols(Locale.US)); + if (timezone != null) { + sdf.setTimeZone(timezone); + } + sdf.setLenient(isLenient); + return sdf; + } + + private boolean strictParsing = false; + + private XHTMLContentHandler handler; + private Metadata metadata; + private EmbeddedDocumentExtractor extractor; + + private boolean inPart = false; + + MailContentHandler(XHTMLContentHandler xhtml, Metadata metadata, ParseContext context, boolean strictParsing) { + this.handler = xhtml; + this.metadata = metadata; + this.strictParsing = strictParsing; + + // Fetch / Build an EmbeddedDocumentExtractor with which + // to handle/process the parts/attachments + + // Was an EmbeddedDocumentExtractor explicitly supplied? + this.extractor = context.get(EmbeddedDocumentExtractor.class); + + // If there's no EmbeddedDocumentExtractor, then try using a normal parser + // This will ensure that the contents are made available to the user, so + // the see the text, but without fine-grained control/extraction + // (This also maintains backward compatibility with older versions!) 
+ if (this.extractor == null) { + // If the user gave a parser, use that, if not the default + Parser parser = context.get(AutoDetectParser.class); + if (parser == null) { + parser = context.get(Parser.class); + } + if (parser == null) { + TikaConfig tikaConfig = context.get(TikaConfig.class); + if (tikaConfig == null) { + tikaConfig = TikaConfig.getDefaultConfig(); + } + parser = new AutoDetectParser(tikaConfig.getParser()); + } + ParseContext ctx = new ParseContext(); + ctx.set(Parser.class, parser); + extractor = new ParsingEmbeddedDocumentExtractor(ctx); + } + } + + public void body(BodyDescriptor body, InputStream is) throws MimeException, + IOException { + // use a different metadata object + // in order to specify the mime type of the + // sub part without damaging the main metadata + + Metadata submd = new Metadata(); + submd.set(Metadata.CONTENT_TYPE, body.getMimeType()); + submd.set(Metadata.CONTENT_ENCODING, body.getCharset()); + + try { + if (extractor.shouldParseEmbedded(submd)) { + extractor.parseEmbedded(is, handler, submd, false); + } + } catch (SAXException e) { + throw new MimeException(e); + } + } + + public void endBodyPart() throws MimeException { + try { + handler.endElement("p"); + handler.endElement("div"); + } catch (SAXException e) { + throw new MimeException(e); + } + } + + public void endHeader() throws MimeException { + } + + public void startMessage() throws MimeException { + try { + handler.startDocument(); + } catch (SAXException e) { + throw new MimeException(e); + } + } + + public void endMessage() throws MimeException { + try { + handler.endDocument(); + } catch (SAXException e) { + throw new MimeException(e); + } + } + + public void endMultipart() throws MimeException { + inPart = false; + } + + public void epilogue(InputStream is) throws MimeException, IOException { + } + + /** + * Header for the whole message or its parts + * + * @see <a href="http://james.apache.org/mime4j/apidocs/org/apache/james/mime4j/parser/"> + * 
http://james.apache.org/mime4j/apidocs/org/apache/james/mime4j/parser/</a> + * Field.html + */ + public void field(Field field) throws MimeException { + // inPart indicates whether these metadata correspond to the + // whole message or its parts + if (inPart) { + return; + } + + try { + String fieldname = field.getName(); + ParsedField parsedField = LenientFieldParser.getParser().parse( + field, DecodeMonitor.SILENT); + if (fieldname.equalsIgnoreCase("From")) { + MailboxListField fromField = (MailboxListField) parsedField; + MailboxList mailboxList = fromField.getMailboxList(); + if (fromField.isValidField() && mailboxList != null) { + for (Address address : mailboxList) { + String from = getDisplayString(address); + metadata.add(Metadata.MESSAGE_FROM, from); + metadata.add(TikaCoreProperties.CREATOR, from); + } + } else { + String from = stripOutFieldPrefix(field, "From:"); + if (from.startsWith("<")) { + from = from.substring(1); + } + if (from.endsWith(">")) { + from = from.substring(0, from.length() - 1); + } + metadata.add(Metadata.MESSAGE_FROM, from); + metadata.add(TikaCoreProperties.CREATOR, from); + } + } else if (fieldname.equalsIgnoreCase("Subject")) { + metadata.set(TikaCoreProperties.TRANSITION_SUBJECT_TO_DC_TITLE, + ((UnstructuredField) parsedField).getValue()); + } else if (fieldname.equalsIgnoreCase("To")) { + processAddressList(parsedField, "To:", Metadata.MESSAGE_TO); + } else if (fieldname.equalsIgnoreCase("CC")) { + processAddressList(parsedField, "Cc:", Metadata.MESSAGE_CC); + } else if (fieldname.equalsIgnoreCase("BCC")) { + processAddressList(parsedField, "Bcc:", Metadata.MESSAGE_BCC); + } else if (fieldname.equalsIgnoreCase("Date")) { + DateTimeField dateField = (DateTimeField) parsedField; + Date date = dateField.getDate(); + if (date == null) { + date = tryOtherDateFormats(field.getBody()); + } + metadata.set(TikaCoreProperties.CREATED, date); + } + } catch (RuntimeException me) { + if (strictParsing) { + throw me; + } + } + } + + private 
static synchronized Date tryOtherDateFormats(String text) { + if (text == null) { + return null; + } + //strip out additional spaces and trim + text = text.replaceAll("\\s+", " ").trim(); + + //strip out commas + text = text.replaceAll(",", ""); + Matcher matcher = GENERAL_TIME_ZONE_NO_MINUTES_PATTERN.matcher(text); + if (matcher.find()) { + text = matcher.replaceFirst("GMT$1$2:00"); + } + + matcher = AM_PM.matcher(text); + if (matcher.find()) { + text = matcher.replaceFirst("$1 $2"); + } + + for (DateFormat format : ALTERNATE_DATE_FORMATS) { + try { + return format.parse(text); + } catch (ParseException e) { + } + } + return null; + } + + private void processAddressList(ParsedField field, String addressListType, + String metadataField) throws MimeException { + AddressListField toField = (AddressListField) field; + if (toField.isValidField()) { + AddressList addressList = toField.getAddressList(); + for (Address address : addressList) { + metadata.add(metadataField, getDisplayString(address)); + } + } else { + String to = stripOutFieldPrefix(field, + addressListType); + for (String eachTo : to.split(",")) { + metadata.add(metadataField, eachTo.trim()); + } + } + } + + private String getDisplayString(Address address) { + if (address instanceof Mailbox) { + Mailbox mailbox = (Mailbox) address; + String name = mailbox.getName(); + if (name != null && name.length() > 0) { + name = DecoderUtil.decodeEncodedWords(name, DecodeMonitor.SILENT); + return name + " <" + mailbox.getAddress() + ">"; + } else { + return mailbox.getAddress(); + } + } else { + return address.toString(); + } + } + + public void preamble(InputStream is) throws MimeException, IOException { + } + + public void raw(InputStream is) throws MimeException, IOException { + } + + public void startBodyPart() throws MimeException { + try { + handler.startElement("div", "class", "email-entry"); + handler.startElement("p"); + } catch (SAXException e) { + throw new MimeException(e); + } + } + + public void 
startHeader() throws MimeException { + // TODO Auto-generated method stub + + } + + public void startMultipart(BodyDescriptor descr) throws MimeException { + inPart = true; + } + + private String stripOutFieldPrefix(Field field, String fieldname) { + String temp = field.getRaw().toString(); + int loc = fieldname.length(); + while (temp.charAt(loc) == ' ') { + loc++; + } + return temp.substring(loc); + } + +} http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java index 9ac02a7..6299d3f 100644 --- a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java +++ b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java @@ -1,95 +1,95 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tika.parser.mail; - -import java.io.IOException; -import java.io.InputStream; -import java.util.Collections; -import java.util.Set; - -import org.apache.james.mime4j.MimeException; -import org.apache.james.mime4j.parser.MimeStreamParser; -import org.apache.james.mime4j.stream.MimeConfig; -import org.apache.tika.exception.TikaException; -import org.apache.tika.io.TikaInputStream; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.mime.MediaType; -import org.apache.tika.parser.AbstractParser; -import org.apache.tika.parser.ParseContext; -import org.apache.tika.sax.XHTMLContentHandler; -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - -/** - * Uses apache-mime4j to parse emails. Each part is treated with the - * corresponding parser and displayed within elements. - * <p/> - * A {@link MimeEntityConfig} object can be passed in the parsing context - * to better control the parsing process. - * - * @author jnio...@digitalpebble.com - */ -public class RFC822Parser extends AbstractParser { - /** - * Serial version UID - */ - private static final long serialVersionUID = -5504243905998074168L; - - private static final Set<MediaType> SUPPORTED_TYPES = Collections - .singleton(MediaType.parse("message/rfc822")); - - public Set<MediaType> getSupportedTypes(ParseContext context) { - return SUPPORTED_TYPES; - } - - public void parse(InputStream stream, ContentHandler handler, - Metadata metadata, ParseContext context) throws IOException, - SAXException, TikaException { - // Get the mime4j configuration, or use a default one - MimeConfig config = new MimeConfig(); - config.setMaxLineLen(100000); - config.setMaxHeaderLen(100000); // max length of any individual header - config = context.get(MimeConfig.class, config); - - MimeStreamParser parser = new MimeStreamParser(config); - XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); - - MailContentHandler mch = new MailContentHandler( - xhtml, metadata, 
context, config.isStrictParsing()); - parser.setContentHandler(mch); - parser.setContentDecoding(true); - - TikaInputStream tstream = TikaInputStream.get(stream); - try { - parser.parse(tstream); - } catch (IOException e) { - tstream.throwIfCauseOf(e); - throw new TikaException("Failed to parse an email message", e); - } catch (MimeException e) { - // Unwrap the exception in case it was not thrown by mime4j - Throwable cause = e.getCause(); - if (cause instanceof TikaException) { - throw (TikaException) cause; - } else if (cause instanceof SAXException) { - throw (SAXException) cause; - } else { - throw new TikaException("Failed to parse an email message", e); - } - } - } - -} +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
package org.apache.tika.parser.mail;

import java.io.IOException;
import java.io.InputStream;
import java.util.Collections;
import java.util.Set;

import org.apache.james.mime4j.MimeException;
import org.apache.james.mime4j.parser.MimeStreamParser;
import org.apache.james.mime4j.stream.MimeConfig;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

/**
 * Uses apache-mime4j to parse emails. Each part is treated with the
 * corresponding parser and displayed within elements.
 * <p>
 * A {@link MimeConfig} object can be passed in the parsing context
 * to better control the parsing process.
 *
 * @author jnio...@digitalpebble.com
 */
public class RFC822Parser extends AbstractParser {
    /**
     * Serial version UID
     */
    private static final long serialVersionUID = -5504243905998074168L;

    /** The single media type handled by this parser: {@code message/rfc822}. */
    private static final Set<MediaType> SUPPORTED_TYPES = Collections
            .singleton(MediaType.parse("message/rfc822"));

    /**
     * @param context the parse context (not consulted)
     * @return the singleton set containing {@code message/rfc822}
     */
    @Override
    public Set<MediaType> getSupportedTypes(ParseContext context) {
        return SUPPORTED_TYPES;
    }

    /**
     * Parses an RFC 822 message with a mime4j {@link MimeStreamParser},
     * forwarding events to a {@code MailContentHandler} that writes XHTML
     * to {@code handler} and header values to {@code metadata}.
     *
     * @param stream   the message to parse
     * @param handler  receives the XHTML output
     * @param metadata receives the extracted header values
     * @param context  may carry a {@link MimeConfig} that replaces the default
     * @throws IOException   if the failure is caused by the input stream itself
     * @throws SAXException  if it was wrapped in a {@link MimeException} during parsing
     * @throws TikaException if the message cannot be parsed
     */
    @Override
    public void parse(InputStream stream, ContentHandler handler,
                      Metadata metadata, ParseContext context) throws IOException,
            SAXException, TikaException {
        // Default configuration with raised limits; a MimeConfig supplied
        // through the parse context takes precedence over it.
        MimeConfig defaults = new MimeConfig();
        defaults.setMaxLineLen(100000);
        defaults.setMaxHeaderLen(100000); // max length of any individual header
        MimeConfig config = context.get(MimeConfig.class, defaults);

        MimeStreamParser parser = new MimeStreamParser(config);
        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);

        MailContentHandler mch = new MailContentHandler(
                xhtml, metadata, context, config.isStrictParsing());
        parser.setContentHandler(mch);
        parser.setContentDecoding(true);

        TikaInputStream tstream = TikaInputStream.get(stream);
        try {
            parser.parse(tstream);
        } catch (IOException e) {
            // Rethrows e unchanged when the stream itself caused it;
            // otherwise report a parse failure.
            tstream.throwIfCauseOf(e);
            throw new TikaException("Failed to parse an email message", e);
        } catch (MimeException e) {
            // Unwrap the exception in case it was not thrown by mime4j
            Throwable cause = e.getCause();
            if (cause instanceof TikaException) {
                throw (TikaException) cause;
            } else if (cause instanceof SAXException) {
                throw (SAXException) cause;
            } else {
                throw new TikaException("Failed to parse an email message", e);
            }
        }
    }

}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/feed/FeedParserTest.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/feed/FeedParserTest.java b/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/feed/FeedParserTest.java index 5be4b0b..cc10dd2 100644 --- a/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/feed/FeedParserTest.java +++ b/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/feed/FeedParserTest.java @@ -1,75 +1,75 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License.
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.parser.feed; - -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertFalse; - -import java.io.InputStream; - -import org.apache.tika.metadata.Metadata; -import org.apache.tika.metadata.TikaCoreProperties; -import org.apache.tika.parser.ParseContext; -import org.apache.tika.sax.BodyContentHandler; -import org.junit.Test; -import org.xml.sax.ContentHandler; - -public class FeedParserTest { - @Test - public void testRSSParser() throws Exception { - try (InputStream input = FeedParserTest.class.getResourceAsStream( - "/test-documents/rsstest.rss")) { - Metadata metadata = new Metadata(); - ContentHandler handler = new BodyContentHandler(); - ParseContext context = new ParseContext(); - - new FeedParser().parse(input, handler, metadata, context); - - String content = handler.toString(); - assertFalse(content == null); - - assertEquals("Sample RSS File for Junit test", - metadata.get(TikaCoreProperties.DESCRIPTION)); - assertEquals("TestChannel", metadata.get(TikaCoreProperties.TITLE)); - - // TODO find a way of testing the paragraphs and anchors - } - } - - - @Test - public void testAtomParser() throws Exception { - try (InputStream input = FeedParserTest.class.getResourceAsStream( - "/test-documents/testATOM.atom")) { - Metadata metadata = new Metadata(); - ContentHandler handler = new BodyContentHandler(); - ParseContext context = new ParseContext(); - - new FeedParser().parse(input, handler, metadata, context); - - String content = handler.toString(); - assertFalse(content == null); - - 
assertEquals("Sample Atom File for Junit test", - metadata.get(TikaCoreProperties.DESCRIPTION)); - assertEquals("Test Atom Feed", metadata.get(TikaCoreProperties.TITLE)); - - // TODO Check some more - } - } - -} +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
package org.apache.tika.parser.feed;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;

import java.io.InputStream;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.BodyContentHandler;
import org.junit.Test;
import org.xml.sax.ContentHandler;

/**
 * Checks the title and description metadata that {@link FeedParser}
 * reports for the sample RSS and Atom documents.
 */
public class FeedParserTest {

    @Test
    public void testRSSParser() throws Exception {
        Metadata metadata = parseFeed("/test-documents/rsstest.rss");

        assertEquals("Sample RSS File for Junit test",
                metadata.get(TikaCoreProperties.DESCRIPTION));
        assertEquals("TestChannel", metadata.get(TikaCoreProperties.TITLE));

        // TODO find a way of testing the paragraphs and anchors
    }

    @Test
    public void testAtomParser() throws Exception {
        Metadata metadata = parseFeed("/test-documents/testATOM.atom");

        assertEquals("Sample Atom File for Junit test",
                metadata.get(TikaCoreProperties.DESCRIPTION));
        assertEquals("Test Atom Feed", metadata.get(TikaCoreProperties.TITLE));

        // TODO Check some more
    }

    /**
     * Parses a classpath resource with a fresh {@link FeedParser} and
     * returns the metadata it filled in.
     *
     * @param resourcePath classpath location of the feed document
     * @return the metadata populated by the parser
     */
    private static Metadata parseFeed(String resourcePath) throws Exception {
        try (InputStream input = FeedParserTest.class.getResourceAsStream(resourcePath)) {
            // Fail with a clear message instead of handing null to the parser.
            assertFalse("Missing test resource: " + resourcePath, input == null);

            Metadata metadata = new Metadata();
            ContentHandler handler = new BodyContentHandler();
            ParseContext context = new ParseContext();

            new FeedParser().parse(input, handler, metadata, context);

            String content = handler.toString();
            assertFalse(content == null);
            return metadata;
        }
    }

}