Author: wkasper
Date: Mon Nov 7 09:19:25 2011
New Revision: 1198669
URL: http://svn.apache.org/viewvc?rev=1198669&view=rev
Log:
Stanbol-368: Added a "simple" extractor for mail files fixing some shortcomings
of the simple Aperture mail extractor
Added:
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/mail/
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/mail/simple/
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/mail/simple/SimpleMailExtractor.java
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/mail/simple/SimpleMailExtractorFactory.java
Modified:
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/resources/extractionregistry.xml
Added:
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/mail/simple/SimpleMailExtractor.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/mail/simple/SimpleMailExtractor.java?rev=1198669&view=auto
==============================================================================
---
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/mail/simple/SimpleMailExtractor.java
(added)
+++
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/mail/simple/SimpleMailExtractor.java
Mon Nov 7 09:19:25 2011
@@ -0,0 +1,392 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.stanbol.enhancer.engines.metaxa.core.mail.simple;
+
+import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.Charset;
+import java.util.List;
+
+import javax.activation.DataHandler;
+import javax.mail.Address;
+import javax.mail.BodyPart;
+import javax.mail.Message.RecipientType;
+import javax.mail.MessagingException;
+import javax.mail.Multipart;
+import javax.mail.Part;
+import javax.mail.internet.AddressException;
+import javax.mail.internet.ContentType;
+import javax.mail.internet.InternetAddress;
+import javax.mail.internet.MimeMessage;
+import javax.mail.internet.MimeUtility;
+
+import
org.apache.stanbol.enhancer.engines.metaxa.core.html.HtmlTextExtractUtil;
+import
org.apache.stanbol.enhancer.engines.metaxa.core.html.InitializationException;
+import org.ontoware.rdf2go.exception.ModelException;
+import org.ontoware.rdf2go.model.Model;
+import org.ontoware.rdf2go.model.Syntax;
+import org.ontoware.rdf2go.model.impl.URIGenerator;
+import org.ontoware.rdf2go.model.node.URI;
+import org.ontoware.rdf2go.model.node.impl.URIImpl;
+import org.ontoware.rdf2go.vocabulary.RDF;
+import org.semanticdesktop.aperture.extractor.Extractor;
+import org.semanticdesktop.aperture.extractor.ExtractorException;
+import org.semanticdesktop.aperture.extractor.mime.MailUtil;
+import org.semanticdesktop.aperture.rdf.RDFContainer;
+import org.semanticdesktop.aperture.rdf.RDFContainerFactory;
+import org.semanticdesktop.aperture.rdf.impl.RDFContainerFactoryImpl;
+import org.semanticdesktop.aperture.vocabulary.NFO;
+import org.semanticdesktop.aperture.vocabulary.NIE;
+import org.semanticdesktop.aperture.vocabulary.NMO;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * An Extractor implementation for message/rfc822-style messages.
+ *
+ * <p>
+ * Only typical body parts are processed during full-text extraction.
+ * Attachments are only listed but not further handled. In case of mails in
+ * HTML format, the full HTML is included in the extracted data as value of
+ * the <code>nmo:htmlMessageContent</code> property. The plain text (extract)
+ * is represented by the <code>nmo:plainTextMessageContent</code> property
+ * and as value of the <code>nie:plainTextContent</code> property for
+ * compliance with the representation from other extractors.
+ * </p>
+ */
+public class SimpleMailExtractor implements Extractor {
+ private static final Logger logger =
LoggerFactory.getLogger(SimpleMailExtractor.class);
+
+ public void extract(URI id, InputStream stream, Charset charset, String
mimeType, RDFContainer result) throws ExtractorException {
+ try {
+ // parse the stream
+ MimeMessage message = new MimeMessage(null, stream);
+
+ result.add(RDF.type, NMO.Email);
+
+ // extract the full-text
+ StringBuilder buffer = new StringBuilder(10000);
+ processMessage(message, buffer, result);
+ String text = buffer.toString().trim();
+ if (text.length() > 0) {
+ result.add(NMO.plainTextMessageContent, text);
+ result.add(NIE.plainTextContent, text);
+ }
+
+ // extract other metadata
+ String title = message.getSubject();
+ if (title != null) {
+ title = title.trim();
+ if (title.length() > 0) {
+ result.add(NMO.messageSubject, title);
+ }
+ }
+
+ try {
+ copyAddress(message.getFrom(), NMO.from, result);
+ } catch (AddressException e) {
+ // ignore
+ }
+
+ copyAddress(getRecipients(message, RecipientType.TO), NMO.to,
result);
+ copyAddress(getRecipients(message, RecipientType.CC), NMO.cc,
result);
+ copyAddress(getRecipients(message, RecipientType.BCC), NMO.bcc,
result);
+
+ MailUtil.getDates(message, result);
+
+ } catch (MessagingException e) {
+ throw new ExtractorException(e);
+ } catch (IOException e) {
+ throw new ExtractorException(e);
+ }
+ }
+
+ /**
+  * Handles the top level message.
+  *
+  * <p>
+  * For a <code>text/html</code> message whose content is a String, the
+  * HTML is reduced to text via {@link #extractTextFromHtml}, the original
+  * HTML is stored as <code>nmo:htmlMessageContent</code> and the text is
+  * passed on. All other content is handed to {@link #processContent}
+  * unchanged. The content is decoded only once.
+  * </p>
+  *
+  * @param msg the parsed message
+  * @param buffer receives the extracted text
+  * @param rdf receives the extracted metadata
+  */
+ protected void processMessage(MimeMessage msg, StringBuilder buffer,
+         RDFContainer rdf) throws MessagingException, IOException,
+         ExtractorException {
+     if (msg.isMimeType("text/html")) {
+         String encoding = getContentEncoding(
+             new ContentType(msg.getContentType()));
+         logger.debug("HTML encoding: {}", encoding);
+         Object content = msg.getContent();
+         if (content instanceof String) {
+             String html = (String) content;
+             // the extractor sees the trimmed markup, the stored
+             // property keeps the HTML exactly as delivered
+             String text = extractTextFromHtml(html.trim(), encoding, rdf);
+             rdf.add(NMO.htmlMessageContent, html);
+             processContent(text, buffer, rdf);
+         } else {
+             processContent(content, buffer, rdf);
+         }
+     } else {
+         // text/plain and every other type take the generic path
+         processContent(msg.getContent(), buffer, rdf);
+     }
+ }
+
+ // the recursive part
+ protected void processContent(Object content, StringBuilder buffer,
RDFContainer rdf) throws MessagingException,
+
IOException,
+
ExtractorException {
+ if (content instanceof String) {
+ buffer.append(content);
+ buffer.append(' ');
+ } else if (content instanceof BodyPart) {
+ BodyPart bodyPart = (BodyPart) content;
+ DataHandler handler = bodyPart.getDataHandler();
+ String encoding = null;
+ if (handler != null) {
+ encoding = MimeUtility.getEncoding(handler);
+ }
+ String fileName = bodyPart.getFileName();
+ String contentType = bodyPart.getContentType();
+ if (fileName != null) {
+ try {
+ fileName = MimeUtility.decodeWord(fileName);
+ } catch (MessagingException e) {
+ // happens on unencoded file names! so just ignore it and
leave the file name as it is
+ }
+ URI attachURI = URIGenerator.createNewRandomUniqueURI();
+ rdf.add(NMO.hasAttachment, attachURI);
+ Model m = rdf.getModel();
+ m.addStatement(attachURI, RDF.type, NFO.Attachment);
+ m.addStatement(attachURI, NFO.fileName, fileName);
+ if (handler != null) {
+ if (encoding != null) {
+ m.addStatement(attachURI, NFO.encoding, encoding);
+ }
+ }
+ if (contentType != null) {
+ contentType = (new ContentType(contentType)).getBaseType();
+ m.addStatement(attachURI, NIE.mimeType,
contentType.trim());
+ }
+ // TODO: encoding?
+ }
+
+ // append the content, if any
+ content = bodyPart.getContent();
+
+ // remove any html markup if necessary
+ if (contentType != null && content instanceof String) {
+ contentType = contentType.toLowerCase();
+ if (contentType.indexOf("text/html") >= 0) {
+ if (encoding != null) {
+ encoding = MimeUtility.javaCharset(encoding);
+ }
+ content = extractTextFromHtml((String) content, encoding,
rdf);
+ }
+ }
+
+ processContent(content, buffer, rdf);
+ } else if (content instanceof Multipart) {
+ Multipart multipart = (Multipart) content;
+ String subType = null;
+
+ String contentType = multipart.getContentType();
+ if (contentType != null) {
+ ContentType ct = new ContentType(contentType);
+ subType = ct.getSubType();
+ if (subType != null) {
+ subType = subType.trim().toLowerCase();
+ }
+ }
+
+ if ("alternative".equals(subType)) {
+ handleAlternativePart(multipart, buffer, rdf);
+ } else if ("signed".equals(subType)) {
+ handleProtectedPart(multipart, 0, buffer, rdf);
+ } else if ("encrypted".equals(subType)) {
+ handleProtectedPart(multipart, 1, buffer, rdf);
+ } else {
+ // handles multipart/mixed, /digest, /related, /parallel,
/report and unknown subtypes
+ handleMixedPart(multipart, buffer, rdf);
+ }
+ }
+ }
+
+ /**
+  * Handles a <code>multipart/alternative</code> container.
+  *
+  * <p>
+  * If a <code>text/html</code> part with String content exists, it is
+  * stored as <code>nmo:htmlMessageContent</code>. The text is taken from
+  * the first <code>text/plain</code> part or, if there is none, from the
+  * first <code>text/html</code> part with its markup removed. If neither
+  * exists nothing is extracted.
+  * </p>
+  *
+  * @param multipart the alternative container
+  * @param buffer receives the extracted text
+  * @param rdf receives the extracted metadata
+  */
+ protected void handleAlternativePart(Multipart multipart,
+         StringBuilder buffer, RDFContainer rdf)
+         throws MessagingException, IOException, ExtractorException {
+     int plainIdx = getPartWithMimeType(multipart, "text/plain");
+     int htmlIdx = getPartWithMimeType(multipart, "text/html");
+
+     // add nmo:htmlMessageContent property
+     BodyPart htmlPart = null;
+     Object html = null;
+     if (htmlIdx >= 0) {
+         htmlPart = multipart.getBodyPart(htmlIdx);
+         html = htmlPart.getContent();
+         if (html instanceof String) {
+             rdf.add(NMO.htmlMessageContent, (String) html);
+         }
+     }
+
+     if (plainIdx >= 0) {
+         // find the first text/plain part ...
+         Object content = multipart.getBodyPart(plainIdx).getContent();
+         if (content != null) {
+             processContent(content, buffer, rdf);
+         }
+     } else if (html != null) {
+         // ... or else the first text/html part
+         Object content = html;
+         if (html instanceof String) {
+             // the charset comes from the Content-Type header, not from
+             // the transfer encoding; the header was already parsed
+             // successfully when the part was located
+             String charset = getContentEncoding(
+                 new ContentType(htmlPart.getContentType()));
+             if (charset != null) {
+                 charset = MimeUtility.javaCharset(charset);
+             }
+             content = extractTextFromHtml((String) html, charset, rdf);
+         }
+         processContent(content, buffer, rdf);
+     }
+ }
+
+ protected void handleMixedPart(Multipart multipart, StringBuilder buffer,
RDFContainer rdf) throws MessagingException,
+
IOException,
+
ExtractorException {
+ int count = multipart.getCount();
+ for (int i = 0; i < count; i++) {
+ processContent(multipart.getBodyPart(i), buffer, rdf);
+ }
+ }
+
+ /**
+  * Processes the single body part at the given position, if the container
+  * has that many parts; otherwise does nothing.
+  *
+  * @param multipart the signed or encrypted container
+  * @param index position of the part to process
+  * @param buffer receives the extracted text
+  * @param rdf receives the extracted metadata
+  */
+ protected void handleProtectedPart(Multipart multipart, int index,
+         StringBuilder buffer, RDFContainer rdf)
+         throws MessagingException, IOException, ExtractorException {
+     if (index >= multipart.getCount()) {
+         return;
+     }
+     processContent(multipart.getBodyPart(index), buffer, rdf);
+ }
+
+ /**
+  * Finds the first body part whose base MIME type equals the given type,
+  * ignoring case.
+  *
+  * @param multipart the container to search
+  * @param mimeType the base type to look for, e.g. "text/plain"
+  * @return the position of the first matching part, or -1 if none matches
+  */
+ protected int getPartWithMimeType(Multipart multipart, String mimeType)
+         throws MessagingException {
+     int position = 0;
+     while (position < multipart.getCount()) {
+         String partType = getMimeType(multipart.getBodyPart(position));
+         if (mimeType.equalsIgnoreCase(partType)) {
+             return position;
+         }
+         position++;
+     }
+     return -1;
+ }
+
+ /**
+  * Reads the <code>charset</code> parameter of a content type.
+  *
+  * @param contentType the parsed content type, may be null
+  * @return the parameter value, or null if the content type is null or
+  *         carries no such parameter
+  */
+ protected String getContentEncoding(ContentType contentType) {
+     return contentType == null ? null : contentType.getParameter("charset");
+ }
+
+ /**
+  * Returns the encoding that <code>MimeUtility.getEncoding</code> reports
+  * for the data handler of the given part.
+  *
+  * @param mailPart the part to inspect
+  * @return the reported encoding, or null if the part has no data handler
+  */
+ protected String getEncoding(Part mailPart) throws MessagingException {
+     DataHandler handler = mailPart.getDataHandler();
+     if (handler == null) {
+         return null;
+     }
+     return MimeUtility.getEncoding(handler);
+ }
+
+ /**
+  * Returns the base type (type/subtype without parameters) of the part's
+  * content type.
+  *
+  * @param mailPart the part to inspect
+  * @return the base type, or null if the part reports no content type
+  */
+ protected String getMimeType(Part mailPart) throws MessagingException {
+     String rawType = mailPart.getContentType();
+     return rawType == null ? null : new ContentType(rawType).getBaseType();
+ }
+
+ /**
+  * Removes the markup from an HTML string and returns title, author,
+  * description, keywords and body text as one plain text string.
+  *
+  * <p>
+  * The string is already decoded, so it is re-encoded here for the HTML
+  * extractor. The declared charset is used when it is known to the JVM and
+  * can represent the whole string; otherwise UTF-8 is used. The extractor
+  * is always told the charset the bytes were actually written in.
+  * </p>
+  *
+  * @param string the HTML markup
+  * @param charset the charset declared for the markup, may be null
+  * @param rdf supplies the URI under which the scratch metadata is
+  *        collected; it is not modified
+  * @return the extracted text
+  * @throws ExtractorException if the HTML extractor cannot be initialized
+  *         or fails
+  */
+ protected String extractTextFromHtml(String string, String charset,
+         RDFContainer rdf) throws ExtractorException {
+     // parse the HTML and extract full-text and metadata
+     HtmlTextExtractUtil extractor;
+     try {
+         extractor = new HtmlTextExtractUtil();
+     } catch (InitializationException e) {
+         throw new ExtractorException("Could not initialize HtmlExtractor: "
+             + e.getMessage());
+     }
+
+     // choose a charset that bytes and extractor agree on
+     Charset bytesCharset = Charset.forName("UTF-8");
+     if (charset != null) {
+         try {
+             Charset declared = Charset.forName(charset);
+             if (declared.canEncode()
+                     && declared.newEncoder().canEncode(string)) {
+                 bytesCharset = declared;
+             }
+         } catch (IllegalArgumentException e) {
+             // illegal or unsupported charset name
+             logger.debug("Unusable charset '{}', using UTF-8", charset);
+         }
+     }
+
+     InputStream stream =
+         new ByteArrayInputStream(string.getBytes(bytesCharset));
+     RDFContainerFactory containerFactory = new RDFContainerFactoryImpl();
+     URI id = rdf.getDescribedUri();
+     RDFContainer result = containerFactory.getRDFContainer(id);
+     Model meta = result.getModel();
+     try {
+         extractor.extract(id, bytesCharset.name(), stream, result);
+
+         // append metadata and full-text to a string buffer
+         StringBuilder buffer = new StringBuilder(32 * 1024);
+         append(buffer, extractor.getTitle(meta), "\n");
+         append(buffer, extractor.getAuthor(meta), "\n");
+         append(buffer, extractor.getDescription(meta), "\n");
+         List<String> keywords = extractor.getKeywords(meta);
+         for (String kw : keywords) {
+             append(buffer, kw, " ");
+         }
+         buffer.append("\n");
+         append(buffer, extractor.getText(meta), " ");
+         logger.debug("text extracted:\n{}", buffer);
+
+         // return the buffer's content
+         return buffer.toString();
+     } finally {
+         // release the scratch model even if the extraction failed
+         meta.close();
+     }
+ }
+
+ /**
+  * Appends the text followed by the separator; does nothing if the text
+  * is null.
+  *
+  * @param buffer the target buffer
+  * @param text the text to append, may be null
+  * @param sep the separator written after the text
+  */
+ protected void append(StringBuilder buffer, String text, String sep) {
+     if (text == null) {
+         return;
+     }
+     buffer.append(text).append(sep);
+ }
+
+ /**
+  * Returns the recipients of the given type. An AddressException raised
+  * while reading them is ignored.
+  *
+  * @param message the message to read
+  * @param type the recipient type (TO, CC, BCC)
+  * @return the recipients as returned by the message, or null if an
+  *         AddressException occurred
+  */
+ protected Address[] getRecipients(MimeMessage message, RecipientType type)
+         throws MessagingException {
+     try {
+         return message.getRecipients(type);
+     } catch (AddressException e) {
+         // ignore
+         return null;
+     }
+ }
+
+ /**
+  * Adds address metadata for a single InternetAddress or for every
+  * element of an InternetAddress array. Any other argument, including
+  * null, is ignored. A ModelException is logged, not propagated.
+  *
+  * @param address an InternetAddress, an InternetAddress[] or anything
+  *        else
+  * @param predicate the property linking the mail to the address
+  * @param result the container receiving the statements
+  */
+ protected void copyAddress(Object address, URI predicate,
+         RDFContainer result) {
+     try {
+         if (address instanceof InternetAddress[]) {
+             for (InternetAddress entry : (InternetAddress[]) address) {
+                 MailUtil.addAddressMetadata(entry, predicate, result);
+             }
+         } else if (address instanceof InternetAddress) {
+             InternetAddress single = (InternetAddress) address;
+             MailUtil.addAddressMetadata(single, predicate, result);
+         }
+     } catch (ModelException e) {
+         logger.error("ModelException while adding address metadata", e);
+     }
+ }
+
+ /**
+  * Command line helper: extracts every file named on the command line and
+  * writes the resulting model as RDF/XML to standard output.
+  *
+  * @param args paths of the mail files to process
+  * @throws Exception if reading or extracting a file fails
+  */
+ public static void main(String[] args) throws Exception {
+     SimpleMailExtractor extractor = new SimpleMailExtractor();
+     RDFContainerFactory rdfFactory = new RDFContainerFactoryImpl();
+     for (int i = 0; i < args.length; ++i) {
+         File file = new File(args[i]);
+         URI uri = new URIImpl(file.toURI().toString());
+         InputStream in = new FileInputStream(file);
+         try {
+             RDFContainer rdfContainer = rdfFactory.getRDFContainer(uri);
+             Model model = rdfContainer.getModel();
+             try {
+                 extractor.extract(uri, in, null, null, rdfContainer);
+                 model.writeTo(System.out, Syntax.RdfXml);
+             } finally {
+                 // release the model even if extraction or output failed
+                 model.close();
+             }
+         } finally {
+             // the extractor does not close the stream
+             in.close();
+         }
+     }
+ }
+}
\ No newline at end of file
Added:
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/mail/simple/SimpleMailExtractorFactory.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/mail/simple/SimpleMailExtractorFactory.java?rev=1198669&view=auto
==============================================================================
---
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/mail/simple/SimpleMailExtractorFactory.java
(added)
+++
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/mail/simple/SimpleMailExtractorFactory.java
Mon Nov 7 09:19:25 2011
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.stanbol.enhancer.engines.metaxa.core.mail.simple;
+
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.semanticdesktop.aperture.extractor.Extractor;
+import org.semanticdesktop.aperture.extractor.ExtractorFactory;
+
+/**
+ * Factory for {@link SimpleMailExtractor} instances, registered for the
+ * MIME types <code>message/rfc822</code> and <code>message/news</code>.
+ */
+public class SimpleMailExtractorFactory implements ExtractorFactory {
+
+    /** Unmodifiable set of the MIME types handled by the extractor. */
+    private static final Set<String> MIME_TYPES;
+
+    static {
+        Set<String> types = new HashSet<String>();
+        types.add("message/rfc822");
+        types.add("message/news");
+
+        MIME_TYPES = Collections.unmodifiableSet(types);
+    }
+
+    /**
+     * Creates a new extractor.
+     *
+     * @return a fresh {@link SimpleMailExtractor} on every call
+     */
+    public Extractor get() {
+        return new SimpleMailExtractor();
+    }
+
+    /**
+     * Returns the supported MIME types.
+     *
+     * @return an unmodifiable set of MIME type strings
+     */
+    public Set getSupportedMimeTypes() {
+        return MIME_TYPES;
+    }
+}
Modified:
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/resources/extractionregistry.xml
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/resources/extractionregistry.xml?rev=1198669&r1=1198668&r2=1198669&view=diff
==============================================================================
---
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/resources/extractionregistry.xml
(original)
+++
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/resources/extractionregistry.xml
Mon Nov 7 09:19:25 2011
@@ -74,6 +74,6 @@
<name>org.apache.stanbol.enhancer.engines.metaxa.core.mp3.MP3FileExtractorFactory</name>
</extractorFactory>
<extractorFactory>
-
<name>org.semanticdesktop.aperture.extractor.mime.MimeExtractorFactory</name>
+
<name>org.apache.stanbol.enhancer.engines.metaxa.core.mail.simple.SimpleMailExtractorFactory</name>
</extractorFactory>
</extractorFactories>