Author: jukka
Date: Sun Aug 21 17:34:17 2011
New Revision: 1160018

URL: http://svn.apache.org/viewvc?rev=1160018&view=rev
Log:
TIKA-667: Changes to RFC822Parser to support turning off strict parsing

Patch by Mark Butler

Modified:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java?rev=1160018&r1=1160017&r2=1160018&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java Sun Aug 21 17:34:17 2011
@@ -25,6 +25,7 @@ import org.apache.james.mime4j.field.Abs
 import org.apache.james.mime4j.field.AddressListField;
 import org.apache.james.mime4j.field.DateTimeField;
 import org.apache.james.mime4j.field.MailboxListField;
+import org.apache.james.mime4j.field.ParsedField;
 import org.apache.james.mime4j.field.UnstructuredField;
 import org.apache.james.mime4j.field.address.AddressList;
 import org.apache.james.mime4j.field.address.MailboxList;
@@ -45,14 +46,17 @@ import org.xml.sax.SAXException;
  */
 class MailContentHandler implements ContentHandler {
 
+    private boolean strictParsing = false;
+
     private XHTMLContentHandler handler;
     private Metadata metadata;
 
     private boolean inPart = false;
-
-    MailContentHandler(XHTMLContentHandler xhtml, Metadata metadata) {
+    
+    MailContentHandler(XHTMLContentHandler xhtml, Metadata metadata, boolean strictParsing) {
         this.handler = xhtml;
         this.metadata = metadata;
+        this.strictParsing = strictParsing;
     }
 
     public void body(BodyDescriptor body, InputStream is) throws MimeException,
@@ -127,80 +131,65 @@ class MailContentHandler implements Cont
             return;
         }
 
-        String fieldname = field.getName();
-        if (fieldname.equalsIgnoreCase("From")) {
-            MailboxListField fromField =
-                (MailboxListField) AbstractField.parse(field.getRaw());
-            MailboxList mailboxList = fromField.getMailboxList();
-            if (fromField.isValidField() && mailboxList != null) {
-                for (int i = 0; i < mailboxList.size(); i++) {
-                    String from = mailboxList.get(i).getDisplayString();
+        try {
+            String fieldname = field.getName();
+            ParsedField parsedField = AbstractField.parse(field.getRaw());
+            if (fieldname.equalsIgnoreCase("From")) {
+                MailboxListField fromField = (MailboxListField) parsedField;
+                MailboxList mailboxList = fromField.getMailboxList();
+                if (fromField.isValidField() && mailboxList != null) {
+                    for (int i = 0; i < mailboxList.size(); i++) {
+                        String from = mailboxList.get(i).getDisplayString();
+                        metadata.add(Metadata.MESSAGE_FROM, from);
+                        metadata.add(Metadata.AUTHOR, from);
+                    }
+                } else {
+                    String from = stripOutFieldPrefix(field, "From:");
+                    if (from.startsWith("<")) {
+                        from = from.substring(1);
+                    }
+                    if (from.endsWith(">")) {
+                        from = from.substring(0, from.length() - 1);
+                    }
                     metadata.add(Metadata.MESSAGE_FROM, from);
                     metadata.add(Metadata.AUTHOR, from);
                 }
-            } else {
-                String from =
-                    stripOutFieldPrefix(field.getRaw().toString(), "From:");
-                if (from.startsWith("<")) {
-                    from = from.substring(1);
-                }
-                if (from.endsWith(">")) {
-                    from = from.substring(0, from.length() - 1);
-                }
-                metadata.add(Metadata.MESSAGE_FROM, from);
-                metadata.add(Metadata.AUTHOR, from);
+            } else if (fieldname.equalsIgnoreCase("Subject")) {
+                metadata.add(Metadata.SUBJECT,
+                        ((UnstructuredField) parsedField).getValue());
+            } else if (fieldname.equalsIgnoreCase("To")) {
+                processAddressList(parsedField, "To:", Metadata.MESSAGE_TO);
+            } else if (fieldname.equalsIgnoreCase("CC")) {
+                processAddressList(parsedField, "Cc:", Metadata.MESSAGE_CC);
+            } else if (fieldname.equalsIgnoreCase("BCC")) {
+                processAddressList(parsedField, "Bcc:", Metadata.MESSAGE_BCC);
+            } else if (fieldname.equalsIgnoreCase("Date")) {
+                DateTimeField dateField = (DateTimeField) parsedField;
+                metadata.set(Metadata.DATE, dateField.getDate());
+                metadata.set(Metadata.CREATION_DATE, dateField.getDate());
             }
-        } else if (fieldname.equalsIgnoreCase("Subject")) {
-            UnstructuredField subjectField =
-                (UnstructuredField) AbstractField.parse(field.getRaw());
-            metadata.add(Metadata.SUBJECT, subjectField.getValue());
-        } else if (fieldname.equalsIgnoreCase("To")) {
-            AddressListField toField =
-                (AddressListField) AbstractField.parse(field.getRaw());
-            if (toField.isValidField()) {
-                AddressList addressList = toField.getAddressList();
-                for (int i = 0; i < addressList.size(); ++i) {
-                    metadata.add(Metadata.MESSAGE_TO, addressList.get(i).getDisplayString());
-                }
-            } else {
-                String to = stripOutFieldPrefix(field.getRaw().toString(), "To:");
-                for (String eachTo : to.split(",")) {
-                    metadata.add(Metadata.MESSAGE_TO, eachTo.trim());
-                }
+        } catch (RuntimeException me) {
+            if (strictParsing) {
+                throw me;
             }
-        } else if (fieldname.equalsIgnoreCase("CC")) {
-            AddressListField ccField =
-                (AddressListField) AbstractField.parse(field.getRaw());
-            if (ccField.isValidField()) {
-                AddressList addressList = ccField.getAddressList();
-                for (int i = 0; i < addressList.size(); ++i) {
-                    metadata.add(Metadata.MESSAGE_CC, addressList.get(i).getDisplayString());
-                }
-            } else {
-                String Cc = stripOutFieldPrefix(field.getRaw().toString(), "Cc:");
-                for (String eachCc : Cc.split(",")) {
-                    metadata.add(Metadata.MESSAGE_CC, eachCc.trim());
-                }
+        }
+    }
+
+    private void processAddressList(ParsedField field, String addressListType,
+            String metadataField) throws MimeException {
+        AddressListField toField = (AddressListField) field;
+        if (toField.isValidField()) {
+            AddressList addressList = toField.getAddressList();
+            for (int i = 0; i < addressList.size(); ++i) {
+                metadata.add(metadataField, addressList.get(i)
+                        .getDisplayString());
             }
-        } else if (fieldname.equalsIgnoreCase("BCC")) {
-            AddressListField bccField =
-                (AddressListField) AbstractField.parse(field.getRaw());
-            if(bccField.isValidField()){
-                AddressList addressList = bccField.getAddressList();
-                for (int i = 0; i < addressList.size(); ++i) {
-                    metadata.add(Metadata.MESSAGE_BCC, addressList.get(i).getDisplayString());
-                }
-            } else {
-                String Bcc = stripOutFieldPrefix(field.getRaw().toString(), "Bcc:");
-                for(String eachBcc : Bcc.split(",")){
-                    metadata.add(Metadata.MESSAGE_CC, eachBcc.trim());
-                }
+        } else {
+            String to = stripOutFieldPrefix(field,
+                    addressListType);
+            for (String eachTo : to.split(",")) {
+                metadata.add(metadataField, eachTo.trim());
             }
-        }  else if (fieldname.equalsIgnoreCase("Date")) {
-            DateTimeField dateField =
-                (DateTimeField) AbstractField.parse(field.getRaw());
-            metadata.set(Metadata.DATE, dateField.getDate());
-            metadata.set(Metadata.CREATION_DATE, dateField.getDate());
         }
     }
 
@@ -228,12 +217,13 @@ class MailContentHandler implements Cont
         inPart = true;
     }
 
-    public String stripOutFieldPrefix(String rawField, String fieldname){
-        String temp = rawField.substring(fieldname.length(), rawField.length());
-        while (temp.startsWith(" ")) {
-            temp = temp.substring(1);
+    private String stripOutFieldPrefix(Field field, String fieldname) {
+        String temp = field.getRaw().toString();
+        int loc = fieldname.length();
+        while (temp.charAt(loc) ==' ') {
+            loc++;
         }
-        return temp;
+        return temp.substring(loc);
     }
 
 }
\ No newline at end of file

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java?rev=1160018&r1=1160017&r2=1160018&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java Sun Aug 21 17:34:17 2011
@@ -63,7 +63,8 @@ public class RFC822Parser extends Abstra
         MimeStreamParser parser = new MimeStreamParser(config);
         XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
 
-        MailContentHandler mch = new MailContentHandler(xhtml, metadata);
+        MailContentHandler mch = new MailContentHandler(
+                xhtml, metadata, config.isStrictParsing());
         parser.setContentHandler(mch);
         parser.setContentDecoding(true);
         TaggedInputStream tagged = TaggedInputStream.get(stream);


Reply via email to