svn commit: r1198669 - in /incubator/stanbol/trunk/enhancer/engines/metaxa/src/main: java/org/apache/stanbol/enhancer/engines/metaxa/core/mail/ java/org/apache/stanbol/enhancer/engines/metaxa/core/mail/simple/ resources/

wkasper Mon, 07 Nov 2011 01:19:55 -0800

Author: wkasper
Date: Mon Nov  7 09:19:25 2011
New Revision: 1198669

URL: http://svn.apache.org/viewvc?rev=1198669&view=rev
Log:
Stanbol-368: Added a "simple" extractor for mail files fixing some shortcomings 
of the simple Aperture mail extractor


Added:
    
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/mail/
    
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/mail/simple/
    
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/mail/simple/SimpleMailExtractor.java
    
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/mail/simple/SimpleMailExtractorFactory.java
Modified:
    
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/resources/extractionregistry.xml

Added: 
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/mail/simple/SimpleMailExtractor.java
URL: 
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/mail/simple/SimpleMailExtractor.java?rev=1198669&view=auto
==============================================================================
--- 
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/mail/simple/SimpleMailExtractor.java
 (added)
+++ 
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/mail/simple/SimpleMailExtractor.java
 Mon Nov  7 09:19:25 2011
@@ -0,0 +1,392 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.stanbol.enhancer.engines.metaxa.core.mail.simple;
+
+import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.Charset;
+import java.util.List;
+
+import javax.activation.DataHandler;
+import javax.mail.Address;
+import javax.mail.BodyPart;
+import javax.mail.Message.RecipientType;
+import javax.mail.MessagingException;
+import javax.mail.Multipart;
+import javax.mail.Part;
+import javax.mail.internet.AddressException;
+import javax.mail.internet.ContentType;
+import javax.mail.internet.InternetAddress;
+import javax.mail.internet.MimeMessage;
+import javax.mail.internet.MimeUtility;
+
+import 
org.apache.stanbol.enhancer.engines.metaxa.core.html.HtmlTextExtractUtil;
+import 
org.apache.stanbol.enhancer.engines.metaxa.core.html.InitializationException;
+import org.ontoware.rdf2go.exception.ModelException;
+import org.ontoware.rdf2go.model.Model;
+import org.ontoware.rdf2go.model.Syntax;
+import org.ontoware.rdf2go.model.impl.URIGenerator;
+import org.ontoware.rdf2go.model.node.URI;
+import org.ontoware.rdf2go.model.node.impl.URIImpl;
+import org.ontoware.rdf2go.vocabulary.RDF;
+import org.semanticdesktop.aperture.extractor.Extractor;
+import org.semanticdesktop.aperture.extractor.ExtractorException;
+import org.semanticdesktop.aperture.extractor.mime.MailUtil;
+import org.semanticdesktop.aperture.rdf.RDFContainer;
+import org.semanticdesktop.aperture.rdf.RDFContainerFactory;
+import org.semanticdesktop.aperture.rdf.impl.RDFContainerFactoryImpl;
+import org.semanticdesktop.aperture.vocabulary.NFO;
+import org.semanticdesktop.aperture.vocabulary.NIE;
+import org.semanticdesktop.aperture.vocabulary.NMO;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * An Extractor implementation for message/rfc822-style messages.
+ * 
+ * <p>
+ * Only typical body parts are processed during full-text extraction. 
Attachments are only listed but not
+ * further handled. In case of mails in HTML format, the full HTML is included 
in the extracted data as value
+ * of the <code>nmo:htmlMessageContent</code> property. The plain text 
(extract) is represented by the
+ * <code>nmo:plainTextMessageContent</code> property and as value of the 
<code>nie:plainTextContent</code>
+ * property for compliance with the representation from other extractors.
+ * 
+ * 
+ */
+public class SimpleMailExtractor implements Extractor {
+    private static final Logger logger = 
LoggerFactory.getLogger(SimpleMailExtractor.class);
+    
+    /**
+     * Parses <code>stream</code> as a MIME message and records its content and metadata in
+     * <code>result</code>.
+     * <p>
+     * The container is typed as <code>nmo:Email</code>. The collected body text, when not empty, is
+     * stored under both <code>nmo:plainTextMessageContent</code> and <code>nie:plainTextContent</code>.
+     * Subject, sender, recipients (to, cc, bcc) and dates are added afterwards. The <code>id</code>,
+     * <code>charset</code> and <code>mimeType</code> arguments are not read by this method.
+     *
+     * @param id identifier of the message (unused here)
+     * @param stream the raw message
+     * @param charset unused here
+     * @param mimeType unused here
+     * @param result container that receives the extracted statements
+     * @throws ExtractorException wrapping any MessagingException or IOException raised while processing
+     */
+    public void extract(URI id,
+                        InputStream stream,
+                        Charset charset,
+                        String mimeType,
+                        RDFContainer result) throws ExtractorException {
+        try {
+            MimeMessage mail = new MimeMessage(null, stream);
+
+            result.add(RDF.type, NMO.Email);
+
+            // collect the full text of the message body
+            StringBuilder fullText = new StringBuilder(10000);
+            processMessage(mail, fullText, result);
+            String plainText = fullText.toString().trim();
+            if (plainText.length() != 0) {
+                result.add(NMO.plainTextMessageContent, plainText);
+                result.add(NIE.plainTextContent, plainText);
+            }
+
+            // subject: skipped when missing or blank
+            String subject = mail.getSubject();
+            if (subject != null) {
+                String trimmedSubject = subject.trim();
+                if (trimmedSubject.length() != 0) {
+                    result.add(NMO.messageSubject, trimmedSubject);
+                }
+            }
+
+            // sender: an unparsable address header is skipped on purpose
+            try {
+                Address[] senders = mail.getFrom();
+                copyAddress(senders, NMO.from, result);
+            } catch (AddressException ignored) {
+                // ignore
+            }
+
+            // recipients
+            copyAddress(getRecipients(mail, RecipientType.TO), NMO.to, result);
+            copyAddress(getRecipients(mail, RecipientType.CC), NMO.cc, result);
+            copyAddress(getRecipients(mail, RecipientType.BCC), NMO.bcc, result);
+
+            MailUtil.getDates(mail, result);
+        } catch (MessagingException e) {
+            throw new ExtractorException(e);
+        } catch (IOException e) {
+            throw new ExtractorException(e);
+        }
+    }
+    
+    /**
+     * Processes the top-level message.
+     * <p>
+     * A <code>text/html</code> message whose content is a String is stored verbatim as
+     * <code>nmo:htmlMessageContent</code>, and the text extracted from it is appended to
+     * <code>buffer</code>. Every other message is handed to
+     * {@link #processContent(Object, StringBuilder, RDFContainer)} unchanged.
+     *
+     * @param msg the message to process
+     * @param buffer receives the extracted plain text
+     * @param rdf receives the extracted statements
+     * @throws MessagingException if the message cannot be read
+     * @throws IOException if the message content cannot be read
+     * @throws ExtractorException if the HTML text extraction fails
+     */
+    protected void processMessage(MimeMessage msg, StringBuilder buffer, RDFContainer rdf)
+            throws MessagingException, IOException, ExtractorException {
+        // Fetch the content once and reuse it instead of asking JavaMail for it repeatedly.
+        Object content = msg.getContent();
+
+        if (!msg.isMimeType("text/html")) {
+            // text/plain and every other type take the generic path
+            processContent(content, buffer, rdf);
+            return;
+        }
+
+        String encoding = getContentEncoding(new ContentType(msg.getContentType()));
+        logger.debug("HTML encoding: {}", encoding);
+        if (content instanceof String) {
+            String html = (String) content;
+            String text = extractTextFromHtml(html.trim(), encoding, rdf);
+            rdf.add(NMO.htmlMessageContent, html);
+            processContent(text, buffer, rdf);
+        } else {
+            processContent(content, buffer, rdf);
+        }
+    }
+    
+    /**
+     * Recursively collects the text of a piece of message content.
+     * <p>
+     * Strings are appended to <code>buffer</code> followed by a blank. Body parts are described as
+     * attachments when they carry a file name and are then processed through their content.
+     * Multiparts are dispatched by subtype. Content of any other type is ignored.
+     *
+     * @param content a String, BodyPart or Multipart; anything else is skipped
+     * @param buffer receives the extracted plain text
+     * @param rdf receives attachment statements
+     * @throws MessagingException if a part cannot be read or has an unparsable content type
+     * @throws IOException if part content cannot be read
+     * @throws ExtractorException if the HTML text extraction fails
+     */
+    protected void processContent(Object content, StringBuilder buffer, RDFContainer rdf)
+            throws MessagingException, IOException, ExtractorException {
+        if (content instanceof String) {
+            buffer.append(content);
+            buffer.append(' ');
+        } else if (content instanceof BodyPart) {
+            processBodyPart((BodyPart) content, buffer, rdf);
+        } else if (content instanceof Multipart) {
+            processMultipart((Multipart) content, buffer, rdf);
+        }
+    }
+
+    /** Describes the part as an attachment if it has a file name, then processes its content. */
+    private void processBodyPart(BodyPart bodyPart, StringBuilder buffer, RDFContainer rdf)
+            throws MessagingException, IOException, ExtractorException {
+        String contentType = bodyPart.getContentType();
+        String fileName = bodyPart.getFileName();
+        if (fileName != null) {
+            describeAttachment(bodyPart, fileName, contentType, rdf);
+        }
+
+        // append the content, if any
+        Object partContent = bodyPart.getContent();
+
+        // remove any html markup if necessary
+        if (contentType != null && partContent instanceof String && bodyPart.isMimeType("text/html")) {
+            // The charset comes from the Content-Type header, as in processMessage. The value of
+            // MimeUtility.getEncoding() is a Content-Transfer-Encoding (e.g. "base64"), not a charset.
+            String charset = null;
+            try {
+                charset = getContentEncoding(new ContentType(contentType));
+            } catch (MessagingException e) {
+                logger.debug("Unparsable content type '{}', no charset available", contentType);
+            }
+            if (charset != null) {
+                charset = MimeUtility.javaCharset(charset);
+            }
+            partContent = extractTextFromHtml((String) partContent, charset, rdf);
+        }
+
+        processContent(partContent, buffer, rdf);
+    }
+
+    /** Adds the nmo:hasAttachment statement and the attachment's file name, encoding and MIME type. */
+    private void describeAttachment(BodyPart bodyPart, String fileName, String contentType, RDFContainer rdf)
+            throws MessagingException, IOException {
+        String decodedName = fileName;
+        try {
+            decodedName = MimeUtility.decodeWord(fileName);
+        } catch (MessagingException e) {
+            // happens on unencoded file names! so just ignore it and leave the file name as it is
+        }
+
+        URI attachURI = URIGenerator.createNewRandomUniqueURI();
+        rdf.add(NMO.hasAttachment, attachURI);
+        Model m = rdf.getModel();
+        m.addStatement(attachURI, RDF.type, NFO.Attachment);
+        m.addStatement(attachURI, NFO.fileName, decodedName);
+
+        String transferEncoding = getEncoding(bodyPart);
+        if (transferEncoding != null) {
+            m.addStatement(attachURI, NFO.encoding, transferEncoding);
+        }
+        if (contentType != null) {
+            String baseType = new ContentType(contentType).getBaseType();
+            m.addStatement(attachURI, NIE.mimeType, baseType.trim());
+        }
+    }
+
+    /** Dispatches a multipart to the handler for its subtype. */
+    private void processMultipart(Multipart multipart, StringBuilder buffer, RDFContainer rdf)
+            throws MessagingException, IOException, ExtractorException {
+        String subType = null;
+        String contentType = multipart.getContentType();
+        if (contentType != null) {
+            subType = new ContentType(contentType).getSubType();
+            if (subType != null) {
+                subType = subType.trim();
+            }
+        }
+
+        // equalsIgnoreCase is independent of the default locale, unlike toLowerCase()
+        if ("alternative".equalsIgnoreCase(subType)) {
+            handleAlternativePart(multipart, buffer, rdf);
+        } else if ("signed".equalsIgnoreCase(subType)) {
+            handleProtectedPart(multipart, 0, buffer, rdf);
+        } else if ("encrypted".equalsIgnoreCase(subType)) {
+            handleProtectedPart(multipart, 1, buffer, rdf);
+        } else {
+            // handles multipart/mixed, /digest, /related, /parallel, /report and unknown subtypes
+            handleMixedPart(multipart, buffer, rdf);
+        }
+    }
+    
+    /**
+     * Handles a <code>multipart/alternative</code>: uses the first text/plain part for the full text,
+     * or else the first text/html part.
+     * <p>
+     * If a text/html part with String content exists, it is always stored verbatim as
+     * <code>nmo:htmlMessageContent</code>. When there is no text/plain part, the text is extracted from
+     * the HTML part instead. Previously the HTML fallback was never reached, so HTML-only
+     * alternatives contributed no text.
+     *
+     * @param multipart the multipart/alternative to process
+     * @param buffer receives the extracted plain text
+     * @param rdf receives the extracted statements
+     * @throws MessagingException if a part cannot be read
+     * @throws IOException if part content cannot be read
+     * @throws ExtractorException if the HTML text extraction fails
+     */
+    protected void handleAlternativePart(Multipart multipart, StringBuilder buffer, RDFContainer rdf)
+            throws MessagingException, IOException, ExtractorException {
+        int plainIdx = getPartWithMimeType(multipart, "text/plain");
+        int htmlIdx = getPartWithMimeType(multipart, "text/html");
+
+        // add nmo:htmlMessageContent property
+        Object html = null;
+        if (htmlIdx >= 0) {
+            html = multipart.getBodyPart(htmlIdx).getContent();
+            if (html instanceof String) {
+                rdf.add(NMO.htmlMessageContent, (String) html);
+            }
+        }
+
+        if (plainIdx >= 0) {
+            Object content = multipart.getBodyPart(plainIdx).getContent();
+            if (content != null) {
+                processContent(content, buffer, rdf);
+            }
+        } else if (html != null) {
+            // no text/plain alternative: fall back to the text of the HTML part
+            Object content = html;
+            if (html instanceof String) {
+                // Charset from the Content-Type header, as in processMessage; the transfer
+                // encoding returned by getEncoding() is not a charset.
+                ContentType htmlType = new ContentType(multipart.getBodyPart(htmlIdx).getContentType());
+                String charset = getContentEncoding(htmlType);
+                if (charset != null) {
+                    charset = MimeUtility.javaCharset(charset);
+                }
+                content = extractTextFromHtml((String) html, charset, rdf);
+            }
+            processContent(content, buffer, rdf);
+        }
+    }
+    
+    /**
+     * Processes every body part of the multipart in order.
+     *
+     * @param multipart the multipart whose parts are processed
+     * @param buffer receives the extracted plain text
+     * @param rdf receives the extracted statements
+     */
+    protected void handleMixedPart(Multipart multipart, StringBuilder buffer, RDFContainer rdf)
+            throws MessagingException, IOException, ExtractorException {
+        final int total = multipart.getCount();
+        int position = 0;
+        while (position < total) {
+            BodyPart part = multipart.getBodyPart(position);
+            processContent(part, buffer, rdf);
+            position++;
+        }
+    }
+    
+    /**
+     * Processes the single body part at <code>index</code>; does nothing when the multipart has no
+     * part at that position.
+     *
+     * @param multipart the signed or encrypted multipart
+     * @param index position of the part to process
+     * @param buffer receives the extracted plain text
+     * @param rdf receives the extracted statements
+     */
+    protected void handleProtectedPart(Multipart multipart, int index, StringBuilder buffer, RDFContainer rdf)
+            throws MessagingException, IOException, ExtractorException {
+        if (index >= multipart.getCount()) {
+            return;
+        }
+        BodyPart selected = multipart.getBodyPart(index);
+        processContent(selected, buffer, rdf);
+    }
+    
+    /**
+     * Finds the first body part whose base MIME type equals <code>mimeType</code>, ignoring case.
+     *
+     * @param multipart the multipart to search
+     * @param mimeType the MIME type to look for
+     * @return the index of the first matching part, or -1 if there is none
+     */
+    protected int getPartWithMimeType(Multipart multipart, String mimeType) throws MessagingException {
+        int position = 0;
+        while (position < multipart.getCount()) {
+            String partType = getMimeType(multipart.getBodyPart(position));
+            if (mimeType.equalsIgnoreCase(partType)) {
+                return position;
+            }
+            position++;
+        }
+        return -1;
+    }
+    
+    /**
+     * Returns the <code>charset</code> parameter of the given content type.
+     *
+     * @param contentType the content type, may be null
+     * @return the charset parameter, or null when the content type is null
+     */
+    protected String getContentEncoding(ContentType contentType) {
+        return contentType == null ? null : contentType.getParameter("charset");
+    }
+    
+    /**
+     * Returns what <code>MimeUtility.getEncoding</code> reports for the part's data handler.
+     *
+     * @param mailPart the part to inspect
+     * @return the encoding, or null when the part has no data handler
+     */
+    protected String getEncoding(Part mailPart) throws MessagingException {
+        DataHandler dataHandler = mailPart.getDataHandler();
+        return dataHandler == null ? null : MimeUtility.getEncoding(dataHandler);
+    }
+    
+    /**
+     * Returns the base type (content type without parameters) of the part.
+     *
+     * @param mailPart the part to inspect
+     * @return the base type, or null when the part reports no content type
+     */
+    protected String getMimeType(Part mailPart) throws MessagingException {
+        String rawType = mailPart.getContentType();
+        if (rawType == null) {
+            return null;
+        }
+        return new ContentType(rawType).getBaseType();
+    }
+    
+    /**
+     * Extracts title, author, description, keywords and body text from an HTML string.
+     * <p>
+     * The string is encoded to bytes with the same charset whose name is passed to the HTML
+     * extractor. A missing, illegal or unsupported charset name falls back to UTF-8. Previously
+     * the bytes were produced with the platform default charset regardless of the name handed to
+     * the extractor.
+     *
+     * @param string the HTML source
+     * @param charset preferred charset name, may be null
+     * @param rdf container whose described URI is reused for the temporary extraction container
+     * @return the extracted metadata and text, separated by newlines and blanks
+     * @throws ExtractorException if the HTML extractor cannot be initialized or fails
+     */
+    protected String extractTextFromHtml(String string, String charset, RDFContainer rdf) throws ExtractorException {
+        // parse the HTML and extract full-text and metadata
+        HtmlTextExtractUtil extractor;
+        try {
+            extractor = new HtmlTextExtractUtil();
+        } catch (InitializationException e) {
+            ExtractorException failure = new ExtractorException("Could not initialize HtmlExtractor: " + e.getMessage());
+            // keep the original exception as cause so its stack trace is not lost
+            failure.initCause(e);
+            throw failure;
+        }
+
+        Charset encoding = resolveCharset(charset);
+        InputStream stream = new ByteArrayInputStream(string.getBytes(encoding));
+        RDFContainerFactory containerFactory = new RDFContainerFactoryImpl();
+        URI id = rdf.getDescribedUri();
+        RDFContainer result = containerFactory.getRDFContainer(id);
+        Model meta = result.getModel();
+        try {
+            extractor.extract(id, encoding.name(), stream, result);
+
+            // append metadata and full-text to a string buffer
+            StringBuilder buffer = new StringBuilder(32 * 1024);
+            append(buffer, extractor.getTitle(meta), "\n");
+            append(buffer, extractor.getAuthor(meta), "\n");
+            append(buffer, extractor.getDescription(meta), "\n");
+            List<String> keywords = extractor.getKeywords(meta);
+            for (String kw : keywords) {
+                append(buffer, kw, " ");
+            }
+            buffer.append("\n");
+            append(buffer, extractor.getText(meta), " ");
+            logger.debug("text extracted:\n{}", buffer);
+
+            // return the buffer's content
+            return buffer.toString();
+        } finally {
+            // close the temporary model even when the extraction fails
+            meta.close();
+        }
+    }
+
+    /** Maps a charset name to a Charset, using UTF-8 when the name is null, illegal or unsupported. */
+    private static Charset resolveCharset(String name) {
+        if (name != null) {
+            try {
+                if (Charset.isSupported(name)) {
+                    return Charset.forName(name);
+                }
+            } catch (IllegalArgumentException e) {
+                // the name is not a syntactically legal charset name
+            }
+            logger.debug("Unsupported charset '{}', falling back to UTF-8", name);
+        }
+        return Charset.forName("UTF-8");
+    }
+    
+    /**
+     * Appends <code>text</code> followed by <code>sep</code> to the buffer; a null text appends
+     * nothing.
+     *
+     * @param buffer the buffer to append to
+     * @param text the text to append, may be null
+     * @param sep the separator written after the text
+     */
+    protected void append(StringBuilder buffer, String text, String sep) {
+        if (text == null) {
+            return;
+        }
+        buffer.append(text).append(sep);
+    }
+    
+    /**
+     * Returns the recipients of the given type, tolerating unparsable address headers.
+     *
+     * @param message the message to read
+     * @param type the recipient type
+     * @return the recipients as returned by the message, or null when an AddressException occurs
+     * @throws MessagingException for any failure other than an AddressException
+     */
+    protected Address[] getRecipients(MimeMessage message, RecipientType type) throws MessagingException {
+        try {
+            return message.getRecipients(type);
+        } catch (AddressException ignored) {
+            // ignore
+            return null;
+        }
+    }
+    
+    /**
+     * Adds address metadata for a single InternetAddress or for each element of an
+     * InternetAddress array. Any other argument, including null, is ignored. A ModelException is
+     * logged and not rethrown.
+     *
+     * @param address an InternetAddress or InternetAddress[]
+     * @param predicate the property that links the message to the address
+     * @param result container that receives the address statements
+     */
+    protected void copyAddress(Object address, URI predicate, RDFContainer result) {
+        try {
+            if (address instanceof InternetAddress) {
+                InternetAddress single = (InternetAddress) address;
+                MailUtil.addAddressMetadata(single, predicate, result);
+                return;
+            }
+            if (address instanceof InternetAddress[]) {
+                for (InternetAddress entry : (InternetAddress[]) address) {
+                    MailUtil.addAddressMetadata(entry, predicate, result);
+                }
+            }
+        } catch (ModelException e) {
+            logger.error("ModelException while adding address metadata", e);
+        }
+    }
+    
+    /**
+     * Command line entry point: extracts each mail file named in <code>args</code> and writes the
+     * resulting model to standard output as RDF/XML.
+     *
+     * @param args paths of the mail files to process
+     * @throws Exception if a file cannot be read or extracted
+     */
+    public static void main(String[] args) throws Exception {
+        SimpleMailExtractor extractor = new SimpleMailExtractor();
+        RDFContainerFactory rdfFactory = new RDFContainerFactoryImpl();
+
+        for (String path : args) {
+            File file = new File(path);
+            URI uri = new URIImpl(file.toURI().toString());
+            InputStream in = new FileInputStream(file);
+            try {
+                RDFContainer rdfContainer = rdfFactory.getRDFContainer(uri);
+                Model model = rdfContainer.getModel();
+                try {
+                    extractor.extract(uri, in, null, null, rdfContainer);
+                    model.writeTo(System.out, Syntax.RdfXml);
+                } finally {
+                    model.close();
+                }
+            } finally {
+                // release the file handle even when the extraction fails
+                in.close();
+            }
+        }
+    }
+}
\ No newline at end of file

Added: 
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/mail/simple/SimpleMailExtractorFactory.java
URL: 
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/mail/simple/SimpleMailExtractorFactory.java?rev=1198669&view=auto
==============================================================================
--- 
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/mail/simple/SimpleMailExtractorFactory.java
 (added)
+++ 
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/mail/simple/SimpleMailExtractorFactory.java
 Mon Nov  7 09:19:25 2011
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.stanbol.enhancer.engines.metaxa.core.mail.simple;
+
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.semanticdesktop.aperture.extractor.Extractor;
+import org.semanticdesktop.aperture.extractor.ExtractorFactory;
+
+/**
+ * Factory for {@link SimpleMailExtractor} instances, registered for the MIME types
+ * <code>message/rfc822</code> and <code>message/news</code>.
+ */
+public class SimpleMailExtractorFactory implements ExtractorFactory {
+
+    /** Unmodifiable set of the MIME types this factory supports. */
+    private static final Set<String> MIME_TYPES;
+
+    static {
+        Set<String> types = new HashSet<String>();
+        types.add("message/rfc822");
+        types.add("message/news");
+
+        MIME_TYPES = Collections.unmodifiableSet(types);
+    }
+
+    /**
+     * Creates a new extractor.
+     *
+     * @return a fresh SimpleMailExtractor
+     */
+    public Extractor get() {
+        return new SimpleMailExtractor();
+    }
+
+    /**
+     * Returns the supported MIME types. The declared return type is left exactly as in the
+     * original so that the method keeps matching the ExtractorFactory declaration.
+     *
+     * @return an unmodifiable set of MIME type strings
+     */
+    public Set getSupportedMimeTypes() {
+        return MIME_TYPES;
+    }
+}

Modified: 
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/resources/extractionregistry.xml
URL: 
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/resources/extractionregistry.xml?rev=1198669&r1=1198668&r2=1198669&view=diff
==============================================================================
--- 
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/resources/extractionregistry.xml
 (original)
+++ 
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/resources/extractionregistry.xml
 Mon Nov  7 09:19:25 2011
@@ -74,6 +74,6 @@
                
<name>org.apache.stanbol.enhancer.engines.metaxa.core.mp3.MP3FileExtractorFactory</name>
        </extractorFactory>     
        <extractorFactory>
-               
<name>org.semanticdesktop.aperture.extractor.mime.MimeExtractorFactory</name>
+               
<name>org.apache.stanbol.enhancer.engines.metaxa.core.mail.simple.SimpleMailExtractorFactory</name>
        </extractorFactory>
 </extractorFactories>

svn commit: r1198669 - in /incubator/stanbol/trunk/enhancer/engines/metaxa/src/main: java/org/apache/stanbol/enhancer/engines/metaxa/core/mail/ java/org/apache/stanbol/enhancer/engines/metaxa/core/mail/simple/ resources/

Reply via email to