ti...

bob Tue, 05 Jan 2016 19:51:50 -0800

Added: 
tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/iptc/IptcAnpaParser.java
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/iptc/IptcAnpaParser.java?rev=1723223&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/iptc/IptcAnpaParser.java
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/iptc/IptcAnpaParser.java
 Wed Jan  6 03:50:50 2016
@@ -0,0 +1,808 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.iptc;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.text.ParseException;
+import java.text.SimpleDateFormat;
+import java.util.Collections;
+import java.util.Date;
+import java.util.HashMap;
+import java.util.Locale;
+import java.util.Set;
+import java.util.TimeZone;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+/**
+ * Parser for IPTC ANPA News Wire Feeds
+ */
+public class IptcAnpaParser implements Parser {
+    /** Serial version UID */
+    private static final long serialVersionUID = -6062820170212879115L;
+
+    /** The media type handled by this parser, built from the "vnd.iptc.anpa" text subtype. */
+    private static final MediaType TYPE =
+        MediaType.text("vnd.iptc.anpa");
+
+    /** Singleton set wrapping {@link #TYPE}; this is what getSupportedTypes returns. */
+    private static final Set<MediaType> SUPPORTED_TYPES =
+        Collections.singleton(TYPE);
+
+    /**
+     * Reports the media types this parser handles.
+     *
+     * @param context the parse context (not consulted)
+     * @return the singleton set holding the IPTC ANPA media type
+     */
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Parses a wire message: the stream is split into its sections, the
+     * extracted properties are copied into the metadata, and the message body
+     * is emitted as a single XHTML paragraph.
+     *
+     * @param stream   the raw wire message
+     * @param handler  receives the XHTML SAX events
+     * @param metadata populated from the parsed properties
+     * @param context  the parse context (not consulted here)
+     * @throws IOException   declared by the parser contract
+     * @throws SAXException  if the content handler rejects an event
+     * @throws TikaException declared by the parser contract
+     */
+    @Override
+    public void parse(
+           InputStream stream, ContentHandler handler,
+           Metadata metadata, ParseContext context)
+           throws IOException, SAXException, TikaException {
+
+        HashMap<String,String> properties = this.loadProperties(stream);
+        this.setMetadata(metadata, properties);
+
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+        // the whole message body is written as one paragraph
+        xhtml.startElement("p");
+        String body = clean(properties.get("body"));
+        if (body != null) {
+            xhtml.characters(body);
+        }
+        xhtml.endElement("p");
+        xhtml.endDocument();
+    }
+
+    /**
+     * Convenience overload that parses with a fresh, empty {@link ParseContext}.
+     *
+     * @deprecated This method will be removed in Apache Tika 1.0. Use
+     *             {@link #parse(InputStream, ContentHandler, Metadata, ParseContext)} instead.
+     */
+    @Deprecated
+    public void parse(
+            InputStream stream, ContentHandler handler, Metadata metadata)
+            throws IOException, SAXException, TikaException {
+        parse(stream, handler, metadata, new ParseContext());
+    }
+
+
+   private int FMT_ANPA_1312    = 0x00;   // "NAA 89-3 (ANPA 1312)"
+   private int FMT_ANPA_UPI     = 0x01;   // "United Press International ANPA 
1312 variant"
+   private int FMT_ANPA_UPI_DL  = 0x02;   // "United Press International 
Down-Load Message"
+   private int FMT_IPTC_7901    = 0x03;   // "IPTC7901 Recommended Message 
Format"
+   private int FMT_IPTC_PHOTO   = 0x04;   // "IPTC-NAA Digital Newsphoto 
Parameter Record"
+   private int FMT_IPTC_CHAR    = 0x05;   // "IPTC Unstructured Character 
Oriented File Format (UCOFF)"
+   private int FMT_NITF         = 0x06;   // "News Industry Text Format (NITF)"
+   private int FMT_NITF_TT      = 0x07;   // "Tidningarnas Telegrambyra NITF 
version (TTNITF DTD)"
+   private int FMT_NITF_RB      = 0x08;   // "Ritzaus Bureau NITF version 
(RBNITF DTD)"
+   private int FMT_IPTC_AP      = 0x09;   // "Associated Press news wire 
format"
+   private int FMT_IPTC_BLM     = 0x0A;   // "Bloomberg News news wire format"
+   private int FMT_IPTC_NYT     = 0x0B;   // "New York Times news wire format"
+   private int FMT_IPTC_RTR     = 0x0C;   // "Reuters news wire format"
+
+   private int FORMAT = FMT_ANPA_1312;    // assume the default format to be 
ANPA-1312
+
+   // transmission control characters that frame the sections of a message
+   private final static char SOH = 0x01;    // start of header (ctrl-a)
+   private final static char STX = 0x02;    // start of text (ctrl-b)
+   private final static char ETX = 0x03;    // end of text (ctrl-c)
+   private final static char EOT = 0x04;    // end of transmission (ctrl-d)
+   private final static char SYN = 0x16;    // synchronous idle (ctrl-v)
+
+   // other control characters
+   private final static char BS = 0x08;    // the backspace character (used for diacriticals)
+   private final static char TB = 0x09;    // the tab character
+   private final static char LF = 0x0A;    // line feed
+   private final static char FF = 0x0C;    // form feed
+   private final static char CR = 0x0D;    // carriage return
+   private final static char XQ = 0x11;    // device control (ctrl-q)
+   private final static char XS = 0x13;    // device control (ctrl-s)
+   private final static char FS = 0x1F;    // a field delimiter
+
+   // printable characters that act as delimiters
+   private final static char HY = 0x2D;    // hyphen
+   private final static char SP = 0x20;    // the blank space
+   private final static char LT = 0x3C;    // less than
+   private final static char EQ = 0x3D;    // equals sign
+   private final static char CT = 0x5E;    // caret
+
+   // typographic quote characters
+   private final static char SL = 0x91;    // single-quote left
+   private final static char SR = 0x92;    // single-quote right
+   private final static char DL = 0x93;    // double-quote left
+   private final static char DR = 0x94;    // double-quote right
+
+
+   /**
+    * scan the news messsage and store the metadata and data into a map
+    */
+   private HashMap<String,String> loadProperties(InputStream is) {
+      
+      HashMap<String,String> properties = new HashMap<String,String>();
+
+      FORMAT = this.scanFormat(is);
+
+      byte[] residual = this.getSection(is,"residual");
+
+      byte[] header = this.getSection(is,"header");
+      parseHeader(header, properties);
+
+      byte[] body = this.getSection(is,"body");
+      parseBody(body, properties);
+
+      byte[] footer = this.getSection(is,"footer");
+      parseFooter(footer, properties);
+       
+      return (properties);
+   }
+
+
+   /**
+    * Sniffs the start of the stream for agency names to guess the wire format.
+    * Up to 512K is read; when the stream supports mark/reset it is rewound
+    * afterwards so that the section readers see the message from its start.
+    *
+    * @param is the message stream
+    * @return the FMT_* identifier of the last matching agency, or the current
+    *         FORMAT when nothing matches or the stream cannot be read
+    */
+   private int scanFormat(InputStream is) {
+      int format    = this.FORMAT;
+      int  maxsize  = 524288;     //  512K
+
+      byte[] buf = new byte[maxsize];
+      try {
+         if (is.markSupported()) {
+            is.mark(maxsize);
+         }
+
+         // a single read() may return fewer bytes than the stream holds, so keep
+         // reading until the buffer is full or the stream ends
+         int total = 0;
+         while (total < maxsize) {
+            int count = is.read(buf, total, maxsize - total);
+            if (count < 0) {
+               break;
+            }
+            total += count;
+         }
+
+         // decode only the bytes actually read, not the zero padding behind them
+         String message = (new String(buf, 0, total, UTF_8)).toLowerCase(Locale.ROOT);
+         // these are not if-then-else, because we want to go from most common
+         // and fall through to least.  this is imperfect, as these tags could
+         // show up in other agency stories, but i can't find a spec or any
+         // explicit codes to identify the wire source in the message itself
+
+         if (message.contains("ap-wf")) {
+            format = this.FMT_IPTC_AP;
+         }
+         if (message.contains("reuters")) {
+            format = this.FMT_IPTC_RTR;
+         }
+         if (message.contains("new york times")) {
+            format = this.FMT_IPTC_NYT;
+         }
+         if (message.contains("bloomberg news")) {
+            format = this.FMT_IPTC_BLM;
+         }
+      }
+      catch (IOException eio) {
+         // we are in an unstable state; fall back to the format we already had
+      }
+
+      try {
+         if (is.markSupported()) {
+            is.reset();
+         }
+      }
+      catch (IOException eio) {
+         // we are in an unstable state; the section readers will find nothing to parse
+      }
+      return (format);
+   }
+
+
+   /**
+    * Overrides the wire format used while parsing.
+    *
+    * @param format one of the FMT_* identifiers
+    */
+   private void setFormat(int format) {
+      FORMAT = format;
+   }
+
+
+   private String getFormatName() {
+      
+      String name = "";
+      
+      if (FORMAT == this.FMT_IPTC_AP) {
+         name = "Associated Press";
+      }
+      
+      else if(FORMAT == this.FMT_IPTC_BLM) {
+         name = "Bloomberg";
+      }
+
+      else if(FORMAT == this.FMT_IPTC_NYT) {
+         name = "New York Times";
+      }
+
+      else if(FORMAT == this.FMT_IPTC_RTR) {
+         name = "Reuters";
+      }
+
+      return (name);
+   }
+
+
+   private byte[] getSection(InputStream is, String name) {
+
+      byte[] value = new byte[0];
+
+      if (name.equals("residual")) {
+         // the header shouldn't be more than 1k, but just being generous here
+         int  maxsize  = 8192;     //  8K
+         byte bstart   = SYN;     // check for SYN [0x16 : ctrl-v] (may have 
leftover residue from preceding message)
+         byte bfinish  = SOH;     // check for SOH [0x01 : ctrl-a] (typically 
follows a pair of SYN [0x16 : ctrl-v])
+         value = getSection(is, maxsize, bstart, bfinish, true);
+      }
+
+      else if(name.equals("header")) {
+         // the header shouldn't be more than 1k, but just being generous here
+         int  maxsize  = 8192;     //  8K
+         byte bstart   = SOH;     // check for SOH [0x01 : ctrl-a] (typically 
follows a pair of SYN [0x16 : ctrl-v])
+         byte bfinish  = STX;     // check for STX [0x02 : ctrl-b] (marks end 
of header, beginning of message)
+         value = getSection(is, maxsize, bstart, bfinish, true);
+      }
+
+      else if (name.equals("body")) {
+         // the message shouldn't be more than 16k (?), leaving plenty of space
+         int  maxsize  = 524288;     //  512K
+         byte bstart   = STX;     // check for STX [0x02 : ctrl-b] (marks end 
of header, beginning of message)
+         byte bfinish  = ETX;     // check for ETX [0x03 : ctrl-c] (marks end 
of message, beginning of footer)
+         value = getSection(is, maxsize, bstart, bfinish, true);
+      }
+
+      else if (name.equals("footer")) {
+         // the footer shouldn't be more than 1k , leaving plenty of space
+         int maxsize   = 8192;     //  8K
+         byte bstart   = ETX;     // check for ETX [0x03 : ctrl-c] (marks end 
of message, beginning of footer)
+         byte bfinish  = EOT;     // check for EOT [0x04 : ctrl-d] (marks end 
of transmission)
+         value = getSection(is, maxsize, bstart, bfinish, true);
+      }
+
+      return (value);
+   }
+
+
+   /**
+    * Extracts the bytes lying between a start marker and a finish marker.
+    * <p>
+    * At most {@code maxsize} bytes (further capped by {@code is.available()})
+    * are buffered and scanned.  When the finish marker is found the stream is
+    * left positioned on that marker, so the next section can use it as its own
+    * start marker; otherwise the stream is rewound to where it was on entry.
+    *
+    * @param is           the message stream; must support mark/reset
+    * @param maxsize      upper bound on the number of bytes to examine
+    * @param bstart       the byte that opens the section (not returned)
+    * @param bfinish      the byte that closes the section (not returned)
+    * @param ifincomplete when true, return what followed the start marker even
+    *                     if the finish marker was never seen
+    * @return the section content, or an empty array when nothing was found or
+    *         the stream failed
+    */
+   private byte[] getSection(InputStream is, int maxsize, byte bstart, byte bfinish, boolean ifincomplete) {
+      byte[] value  = new byte[0];
+
+      try {
+         boolean started = false;                   // check if we have found the start flag
+         boolean finished = false;                  // check if we have found the finish flag
+         int read = 0;                              // the number of bytes we scanned
+         int start = 0;                             // the position after the start flag
+
+         // TODO: this only pulls back 8K of data on a read, regardless of buffer size
+         //       more nefariously, it caps at a total 8K, through all sections
+         int streammax = is.available();
+         maxsize = Math.min(maxsize, streammax);
+
+         is.mark(maxsize);
+         byte[] buf = new byte[maxsize];
+
+         // fill the buffer; the length passed to read() must be the space that is
+         // still free, otherwise a short first read makes the next call throw
+         // an IndexOutOfBoundsException
+         int totsize = 0;
+         while (totsize < maxsize) {
+            int msgsize = is.read(buf, totsize, maxsize - totsize);
+            if (msgsize == -1) {
+               break;
+            }
+            totsize += msgsize;
+         }
+
+         // scan through the buffered input
+         for (read = 0; read < totsize; read++) {
+            byte b = buf[read];
+
+            if (!started) {
+               // everything up to and including the start flag is skipped
+               started = (b == bstart);
+               start = read + 1;
+               continue;
+            }
+
+            if (b == bfinish) {
+               finished = true;
+               break;
+            }
+         }
+
+         // move the input stream back to where it was initially
+         is.reset();
+
+         if (finished) {
+            // now, we want the stream to be sitting right on top of the finish marker;
+            // skip() may skip fewer bytes than asked for, so repeat until done
+            long remaining = read;
+            while (remaining > 0) {
+               long skipped = is.skip(remaining);
+               if (skipped <= 0) {
+                  break;   // the stream refuses to advance any further
+               }
+               remaining -= skipped;
+            }
+            value = new byte[read - start];
+            System.arraycopy(buf, start, value, 0, read - start);
+         }
+         else if (ifincomplete && started) {
+            // the caller wants anything that was read, and we finished the stream or buffer
+            value = new byte[read - start];
+            System.arraycopy(buf, start, value, 0, read - start);
+         }
+      }
+      catch (IOException eio) {
+         // something invalid occurred, return an empty section
+      }
+
+      return (value);
+   }
+
+
+   /**
+    * Walks the header section, picking out the service id, the category, the
+    * subject heading and the date/time stamps.  None of these values are
+    * stored in the properties map at present.
+    *
+    * @param value      the header bytes
+    * @param properties the properties map (left untouched)
+    * @return true when at least one of the header fields was non-empty
+    */
+   private boolean parseHeader(byte[] value, HashMap<String,String> properties) {
+      boolean added = false;
+
+      String env_serviceid = "";
+      String env_category = "";
+      String hdr_subject = "";
+      String hdr_date = "";
+      String hdr_time = "";
+
+      int read = 0;
+
+      while (read < value.length) {
+
+         // pull apart the envelope, getting the service id  (....\x1f)
+         while (read < value.length) {
+            byte val_next = value[read++];
+            if (val_next != FS) {
+               env_serviceid += (char)(val_next & 0xff);  // convert the byte to an unsigned int
+            }
+            else {
+               break;
+            }
+         }
+
+         // pull apart the envelope, getting the category  (....\x13\x11)
+         while (read < value.length) {
+            byte val_next = value[read++];
+            if (val_next != XS) {   // the end of the envelope is marked (\x13)
+               env_category += (char)(val_next & 0xff);  // convert the byte to an unsigned int
+            }
+            else {
+               // consume the remaining byte (\x11) when present; the \x13 may be the
+               // very last byte of the header, so check the bounds before peeking
+               if ((read < value.length) && (value[read] == XQ)) {
+                  read++;
+               }
+               break;
+            }
+         }
+
+         // pull apart the envelope, getting the subject heading
+         while (read < value.length) {
+            boolean subject = true;
+            byte val_next = value[read++];
+            while ((subject) && (val_next != SP) && (val_next != 0x00)) {  // ignore the envelope subject
+               hdr_subject += (char)(val_next & 0xff);  // convert the byte to an unsigned int
+               val_next =  (read < value.length) ? value[read++] : 0x00;
+               while (val_next == SP) {  // consume all the spaces
+                  subject = false;
+                  val_next =  (read < value.length) ? value[read++] : 0x00;
+                  if (val_next != SP) {
+                     --read;  // otherwise we eat into the next section
+                  }
+               }
+            }
+            if (!subject) {
+               break;
+            }
+         }
+
+         // pull apart the envelope, getting the date and time
+         while (read < value.length) {
+            byte val_next = value[read++];
+            if (hdr_date.length() == 0) {
+               // consume all numerics and hyphens
+               while (((val_next >= (byte)0x30) && (val_next <= (byte)0x39))
+                  ||   (val_next == HY)) {
+                  hdr_date += (char)(val_next & 0xff);  // convert the byte to an unsigned int
+                  val_next =  (read < value.length) ? value[read++] : 0x00;
+               }
+            }
+            else if (val_next == SP) {
+               while (val_next == SP) {  // consume all the spaces
+                  val_next =  (read < value.length) ? value[read++] : 0x00;
+               }
+               continue;
+            }
+            else {
+               // consume all numerics and hyphens
+               while (((val_next >= (byte)0x30) && (val_next <= (byte)0x39))
+                  ||   (val_next == HY)) {
+                  hdr_time += (char)(val_next & 0xff);  // convert the byte to an unsigned int
+                  val_next =  (read < value.length) ? value[read++] : 0x00;
+               }
+            }
+         }
+         break; // don't let this run back through and start thrashing metadata
+      }
+
+      // if we were saving any of these values, we would set the properties map here
+
+      added = (env_serviceid.length() + env_category.length() + hdr_subject.length() +
+               hdr_date.length() + hdr_time.length()) > 0;
+      return added;
+   }
+
+   private boolean parseBody(byte[] value, HashMap<String,String> properties) {
+      boolean added = false;
+
+      String bdy_heading = "";
+      String bdy_title = "";
+      String bdy_source = "";
+      String bdy_author = "";
+      String bdy_body = "";
+
+      int read = 0;
+      boolean done = false;
+
+      while (!done && (read < value.length)) {
+
+         // pull apart the body, getting the heading (^....\x0d\x0a)
+         while (read < value.length) {
+            byte val_next = value[read++];
+            if (val_next == CT) {      //  start of a new section , first is 
the heading
+               val_next =  (read < value.length) ? value[read++] : 0x00;
+               // AP, NYT, and Bloomberg end with < , Reuters with EOL
+               while ((val_next != LT) && (val_next != CR) && (val_next != 
LF)) {   // less than delimiter (\x3c) and not EOL
+                  bdy_heading += (char)(val_next & 0xff);  // convert the byte 
to an unsigned int
+                  val_next =  (read < value.length) ? value[read++] : 0x00;
+                  if (read > value.length) { break; }  // shouldn't ever hit 
this, but save a NPE
+               }
+               if (val_next == LT) {
+                  // hit the delimiter, carry on
+                  val_next =  (read < value.length) ? value[read++] : 0x00;
+               }
+               while (bdy_heading.length() > 0 && ((val_next == CR) || 
(val_next == LF))) {
+                  val_next =  (read < value.length) ? value[read++] : 0x00;  
// skip the new lines
+                  if ((val_next != CR) && (val_next != LF)) {
+                     --read;
+                  }
+               }
+            }
+            else {
+               // this will only be hit on poorly-formed files
+
+               // for reuters, the heading does not start with the ^, so we 
push one back into the stream
+               if (FORMAT == this.FMT_IPTC_RTR) {
+                  if (val_next != CT) {
+                     // for any non-whitespace, we need to go back an 
additional step to non destroy the data
+                     if ((val_next != SP) && (val_next != TB) && (val_next != 
CR) && (val_next != LF)) {
+                        // if the very first byte is data, we have to shift 
the whole array, and stuff in a carat
+                        if (read == 1) {
+                           byte[] resize = new byte[value.length + 1];
+                           System.arraycopy(value, 0, resize, 1, value.length);
+                           value = resize;
+                        }
+                     }
+                     value[--read] = CT;
+                     continue;
+                  }
+               }
+            }
+            break;
+         }
+
+         // pull apart the body, getting the title (^....\x0d\x0a)
+         while (read < value.length) {
+            byte val_next = value[read++];
+            if (val_next == CT) {      //  start of a new section , first is 
the heading
+               val_next =  (read < value.length) ? value[read++] : 0x00;
+               // AP, NYT, and Bloomberg end with < , Reuters with EOL
+               while ((val_next != LT) && (val_next != CT) && (val_next != CR) 
&& (val_next != LF)) {   // less than delimiter (\x3c), or carat (\x5e) and not 
EOL
+                  bdy_title += (char)(val_next & 0xff);  // convert the byte 
to an unsigned int
+                  val_next =  (read < value.length) ? value[read++] : 0x00;
+                  if (read > value.length) { break; }  // shouldn't ever hit 
this, but save a NPE
+               }
+
+               if (val_next == CT) {      //  start of a new section , when 
first didn't finish cleanly
+                   --read;
+               }
+
+               if (val_next == LT) {
+                  // hit the delimiter, carry on
+                  val_next =  (read < value.length) ? value[read++] : 0x00;
+               }
+
+               while (bdy_title.length() > 0 && ((val_next == CR) || (val_next 
== LF))) {
+                  val_next =  (read < value.length) ? value[read++] : 0x00;  
// skip the new lines
+                  if ((val_next != CR) && (val_next != LF)) {
+                     --read;
+                  }
+               }
+            }
+            else {
+               // this will only be hit on poorly-formed files
+
+               // for bloomberg, the title does not start with the ^, so we 
push one back into the stream
+               if (FORMAT == this.FMT_IPTC_BLM) {
+                  if (val_next == TB) {
+                     value[--read] = CT;
+                     continue;
+                  }
+               }
+
+               // for reuters, the title does not start with the ^, so we push 
one back into the stream
+               if (FORMAT == this.FMT_IPTC_RTR) {
+                  if (val_next != CT) {
+                     // for any non-whitespace, we need to go back an 
additional step to non destroy the data
+                     if ((val_next != SP) && (val_next != TB) && (val_next != 
CR) && (val_next != LF)) {
+                        --read;
+                     }
+                     value[--read] = CT;
+                     continue;
+                  }
+               }
+            }
+            break;
+         }
+
+
+         // at this point, we have a variable number of metadata lines, with 
various orders
+         // we scan the start of each line for the special character, and run 
to the end character
+         // pull apart the body, getting the title (^....\x0d\x0a)
+         boolean metastarted = false;
+         String longline = "";
+         String longkey = "";
+         while (read < value.length) {
+            byte val_next = value[read++];
+
+            // eat up whitespace before committing to the next section
+            if ((val_next == SP) || (val_next == TB) || (val_next == CR) || 
(val_next == LF)) {
+               continue;
+            }
+
+            if (val_next == CT) {      //  start of a new section , could be 
authors, sources, etc
+               val_next =  (read < value.length) ? value[read++] : 0x00;
+               String tmp_line = "";
+               while ((val_next != LT) && (val_next != CT) && (val_next != CR) 
&& (val_next != LF) && (val_next != 0))  {
+                  // less than delimiter (\x3c), maybe also badly formed with 
just new line
+                  tmp_line += (char)(val_next & 0xff);  // convert the byte to 
an unsigned int
+                  val_next =  (read < value.length) ? value[read++] : 0x00;
+                  if (read > value.length) { break; }  // shouldn't ever hit 
this, but save a NPE
+               }
+
+               if (val_next == CT) {      //  start of a new section , when 
first didn't finish cleanly
+                   --read;
+               }
+
+               if (val_next == LT) {
+                  // hit the delimiter, carry on
+                  val_next =  (read < value.length) ? value[read++] : 0x00;
+               }
+
+               while ((val_next == CR) || (val_next == LF)) {
+                  val_next =  (read < value.length) ? value[read++] : 0x00;  
// skip the new lines
+                  if ((val_next != CR) && (val_next != LF)) {
+                     --read;
+                  }
+               }
+               if (tmp_line.toLowerCase(Locale.ROOT).startsWith("by") || 
longline.equals("bdy_author")) {
+                  longkey = "bdy_author";
+
+                  // prepend a space to subsequent line, so it gets parsed 
consistent with the lead line
+                  tmp_line = (longline.equals(longkey) ? " " : "") + tmp_line;
+
+                  // we have an author candidate
+                  int term = tmp_line.length();
+                  term = Math.min(term, (tmp_line.contains("<") ? 
tmp_line.indexOf("<")  : term));
+                  term = Math.min(term, (tmp_line.contains("=") ? 
tmp_line.indexOf("=")  : term));
+                  term = Math.min(term, (tmp_line.contains("\n") ? 
tmp_line.indexOf("\n") : term));
+                  term = (term > 0 ) ? term : tmp_line.length();
+                  bdy_author += tmp_line.substring(tmp_line.indexOf(" "), 
term);
+                  metastarted = true;
+                  longline = ((tmp_line.contains("=")) && 
(!longline.equals(longkey)) ? longkey : "");
+               }
+               else if (FORMAT == this.FMT_IPTC_BLM) {
+                  String byline = "   by ";
+                  if (tmp_line.toLowerCase(Locale.ROOT).contains(byline)) {
+                     longkey = "bdy_author";
+
+                     int term = tmp_line.length();
+                     term = Math.min(term, (tmp_line.contains("<") ? 
tmp_line.indexOf("<")  : term));
+                     term = Math.min(term, (tmp_line.contains("=") ? 
tmp_line.indexOf("=")  : term));
+                     term = Math.min(term, (tmp_line.contains("\n") ? 
tmp_line.indexOf("\n") : term));
+                     term = (term > 0 ) ? term : tmp_line.length();
+                     // for bloomberg, the author line sits below their 
copyright statement
+                     bdy_author += 
tmp_line.substring(tmp_line.toLowerCase(Locale.ROOT).indexOf(byline) + 
byline.length(), term) + " ";
+                     metastarted = true;
+                     longline = ((tmp_line.contains("=")) && 
(!longline.equals(longkey)) ? longkey : "");
+                  }
+                  else if(tmp_line.toLowerCase(Locale.ROOT).startsWith("c.")) {
+                     // the author line for bloomberg is a multiline starting 
with c.2011 Bloomberg News
+                     // then containing the author info on the next line
+                     if (val_next == TB) {
+                        value[--read] = CT;
+                        continue;
+                     }
+                  }
+                  else 
if(tmp_line.toLowerCase(Locale.ROOT).trim().startsWith("(") && 
tmp_line.toLowerCase(Locale.ROOT).trim().endsWith(")")) {
+                     // the author line may have one or more comment lines 
between the copyright
+                     // statement, and the By AUTHORNAME line
+                     if (val_next == TB) {
+                        value[--read] = CT;
+                        continue;
+                     }
+                  }
+               }
+
+               else if (tmp_line.toLowerCase(Locale.ROOT).startsWith("eds") || 
longline.equals("bdy_source")) {
+                  longkey = "bdy_source";
+                  // prepend a space to subsequent line, so it gets parsed 
consistent with the lead line
+                  tmp_line = (longline.equals(longkey) ? " " : "") + tmp_line;
+
+                  // we have a source candidate
+                  int term = tmp_line.length();
+                  term = Math.min(term, (tmp_line.contains("<") ? 
tmp_line.indexOf("<")  : term));
+                  term = Math.min(term, (tmp_line.contains("=") ? 
tmp_line.indexOf("=")  : term));
+//                  term = Math.min(term, (tmp_line.indexOf("\n") > -1 ? 
tmp_line.indexOf("\n") : term));
+                  term = (term > 0 ) ? term : tmp_line.length();
+                  bdy_source += tmp_line.substring(tmp_line.indexOf(" ") + 1, 
term) + " ";
+                  metastarted = true;
+                  longline = (!longline.equals(longkey) ? longkey  : "");
+               }
+               else {
+                  // this has fallen all the way through.  trap it as part of 
the subject,
+                  // rather than just losing it
+                  if (!metastarted) {
+                     bdy_title += " , " + tmp_line;     //  not sure where 
else to put this but in the title
+                  }
+                  else {
+                     // what to do with stuff that is metadata, which falls 
after metadata lines started?
+                     bdy_body += " " + tmp_line + " , ";     //  not sure 
where else to put this but in the title
+                  }
+               }
+            }
+            else {  // we're on to the main body
+               while ((read < value.length) && (val_next != 0))  {
+                  // read until the train runs out of tracks
+                  bdy_body += (char)(val_next & 0xff);  // convert the byte to 
an unsigned int
+                  val_next =  (read < value.length) ? value[read++] : 0x00;
+                  if (read > value.length) { break; }  // shouldn't ever hit 
this, but save a NPE
+               }
+
+            }
+            // we would normally break here, but just let this read out to the 
end
+         }
+         done = true; // don't let this run back through and start thrashing 
metadata
+      }
+      properties.put("body", bdy_body);
+      properties.put("title", bdy_title);
+      properties.put("subject", bdy_heading);
+      properties.put("author", bdy_author);
+      properties.put("source", bdy_source);
+
+      added = (bdy_body.length() + bdy_title.length() + bdy_heading.length() + 
bdy_author.length() +
+               bdy_source.length()) > 0;
+      return added;
+   }
+
+
+   /**
+    * Parses the footer section: leading non-digit bytes are collected as the
+    * feed source, and the bytes that follow (up to the delimiter, a line break,
+    * a NUL byte or the end of input) are treated as a timestamp.
+    * <p>
+    * The timestamp is re-formatted as {@code yyyy-MM-dd'T'HH:mm:ss'Z'} in UTC.
+    * If it cannot be parsed, the current time is used instead.
+    *
+    * @param value      raw footer bytes
+    * @param properties map that receives the "publisher", "created" and
+    *                   "modified" entries (always written, possibly empty)
+    * @return {@code true} if a source or a timestamp was found
+    */
+   private boolean parseFooter(byte[] value, HashMap<String,String> properties) {
+      StringBuilder source = new StringBuilder();
+      StringBuilder stamp = new StringBuilder();
+      String ftr_datetime = "";
+
+      if (value.length > 0) {
+         int read = 0;
+         byte val_next = value[read++];
+
+         // consume all non-numerics first; bytes >= 0x80 are negative as a
+         // signed byte and therefore also land here
+         while (((val_next < (byte)0x30) || (val_next > (byte)0x39)) && (val_next != 0)) {
+            source.append((char)(val_next & 0xff));  // convert the byte to an unsigned value
+            val_next = (read < value.length) ? value[read] : 0x00;  // NUL marks end of input
+            read++;
+            if (read > value.length) { break; }
+         }
+
+         // get as much of the timestamp as possible
+         // (per the original author: American format, arrives as mm-dd-yy HHmmzzz)
+         while ((val_next != LT) && (val_next != CR) && (val_next != LF) && (val_next != 0)) {
+            stamp.append((char)(val_next & 0xff));  // convert the byte to an unsigned value
+            val_next = (read < value.length) ? value[read++] : 0x00;
+            if (read > value.length) { break; }
+         }
+
+         if (stamp.length() > 0) {
+            // we want to pass this back in a more friendly format
+            String format_out = "yyyy-MM-dd'T'HH:mm:ss'Z'";
+            Date dateunix = new Date();
+            try {
+               // standard ap format
+               String format_in = "MM-dd-yy HHmmzzz";
+
+               if (FORMAT == this.FMT_IPTC_RTR) {
+                  // standard reuters format
+                  format_in = "HH:mm MM-dd-yy";
+               }
+               SimpleDateFormat dfi = new SimpleDateFormat(format_in, Locale.ROOT);
+               dfi.setTimeZone(TimeZone.getTimeZone("UTC"));
+               dateunix = dfi.parse(stamp.toString());
+            }
+            catch (ParseException ignored) {
+               // deliberate best effort: fall through with the date set to now
+            }
+            SimpleDateFormat dfo = new SimpleDateFormat(format_out, Locale.ROOT);
+            dfo.setTimeZone(TimeZone.getTimeZone("UTC"));
+            ftr_datetime = dfo.format(dateunix);
+         }
+      }
+
+      String ftr_source = source.toString();
+      properties.put("publisher", ftr_source);
+      properties.put("created", ftr_datetime);
+      properties.put("modified", ftr_datetime);
+
+      return (ftr_source.length() + ftr_datetime.length()) > 0;
+   }
+
+
+   /**
+    * Copies the collected properties into the Tika metadata object.
+    * <p>
+    * Every value goes through {@code clean}, which turns {@code null} into an
+    * empty string, so no property is ever set to {@code null}. The original
+    * author notes that null values cause NPEs in consumers such as Lucene.
+    *
+    * @param metadata   metadata object to populate
+    * @param properties values gathered while parsing; missing keys are tolerated
+    */
+   private void setMetadata(Metadata metadata, HashMap<String,String> properties) {
+      String title = clean(properties.get("title"));
+      String subject = clean(properties.get("subject"));
+      String author = clean(properties.get("author"));
+      String created = clean(properties.get("created"));
+      String modified = clean(properties.get("modified"));
+      String source = clean(properties.get("source"));
+
+      metadata.set(Metadata.CONTENT_TYPE, clean("text/anpa-1312"));
+      metadata.set(TikaCoreProperties.TITLE, title);
+      metadata.set(TikaCoreProperties.KEYWORDS, subject);
+      metadata.set(TikaCoreProperties.CREATOR, author);
+      metadata.set(TikaCoreProperties.CREATED, created);
+      metadata.set(TikaCoreProperties.MODIFIED, modified);
+      metadata.set(TikaCoreProperties.SOURCE, source);
+
+      // The publisher is reported as the detected format name; the
+      // "publisher" entry in properties is intentionally not used here.
+      metadata.set(TikaCoreProperties.PUBLISHER, clean(this.getFormatName()));
+   }
+
+   /**
+    * Normalises a metadata value: doubled backticks and doubled apostrophes
+    * are collapsed to one, the SL/SR characters become an apostrophe, the
+    * DL/DR characters become a double quote, and surrounding whitespace is
+    * trimmed.
+    * <p>
+    * All substitutions are literal, so {@code String.replace} is used rather
+    * than the regex-based {@code replaceAll}.
+    *
+    * @param value text to normalise, may be {@code null}
+    * @return the normalised text, never {@code null}
+    */
+   private String clean(String value) {
+      if (value == null) {
+         return "";
+      }
+
+      return value
+            .replace("``", "`")
+            .replace("''", "'")
+            .replace((char) SL, '\'')
+            .replace((char) SR, '\'')
+            .replace((char) DL, '"')
+            .replace((char) DR, '"')
+            .trim();
+   }
+}


Added: 
tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java?rev=1723223&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
 Wed Jan  6 03:50:50 2016
@@ -0,0 +1,274 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mail;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.james.mime4j.MimeException;
+import org.apache.james.mime4j.codec.DecodeMonitor;
+import org.apache.james.mime4j.codec.DecoderUtil;
+import org.apache.james.mime4j.dom.address.Address;
+import org.apache.james.mime4j.dom.address.AddressList;
+import org.apache.james.mime4j.dom.address.Mailbox;
+import org.apache.james.mime4j.dom.address.MailboxList;
+import org.apache.james.mime4j.dom.field.AddressListField;
+import org.apache.james.mime4j.dom.field.DateTimeField;
+import org.apache.james.mime4j.dom.field.MailboxListField;
+import org.apache.james.mime4j.dom.field.ParsedField;
+import org.apache.james.mime4j.dom.field.UnstructuredField;
+import org.apache.james.mime4j.field.LenientFieldParser;
+import org.apache.james.mime4j.parser.ContentHandler;
+import org.apache.james.mime4j.stream.BodyDescriptor;
+import org.apache.james.mime4j.stream.Field;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Bridge between mime4j's content handler and the generic SAX content handler
+ * used by Tika. Message-level headers are copied into the Tika metadata and
+ * each body is handed to an {@link EmbeddedDocumentExtractor}.
+ *
+ * @see <a href="http://james.apache.org/mime4j/apidocs/org/apache/james/mime4j/parser/ContentHandler.html">mime4j ContentHandler</a>
+ */
+class MailContentHandler implements ContentHandler {
+
+    private final boolean strictParsing;
+
+    private final XHTMLContentHandler handler;
+    private final Metadata metadata;
+    private final EmbeddedDocumentExtractor extractor;
+
+    // true between startMultipart() and endMultipart(); headers seen in that
+    // window are ignored by field()
+    private boolean inPart = false;
+
+    /**
+     * @param xhtml         handler that receives the XHTML output
+     * @param metadata      metadata of the whole message, filled from its headers
+     * @param context       parse context consulted for an extractor, parser or config
+     * @param strictParsing if {@code true}, runtime failures while reading a
+     *                      header are rethrown instead of being ignored
+     */
+    MailContentHandler(XHTMLContentHandler xhtml, Metadata metadata, ParseContext context, boolean strictParsing) {
+        this.handler = xhtml;
+        this.metadata = metadata;
+        this.strictParsing = strictParsing;
+        this.extractor = resolveExtractor(context);
+    }
+
+    /**
+     * Fetches or builds the EmbeddedDocumentExtractor used to process the
+     * parts/attachments.
+     */
+    private static EmbeddedDocumentExtractor resolveExtractor(ParseContext context) {
+        // Was an EmbeddedDocumentExtractor explicitly supplied?
+        EmbeddedDocumentExtractor supplied = context.get(EmbeddedDocumentExtractor.class);
+        if (supplied != null) {
+            return supplied;
+        }
+
+        // If there's no EmbeddedDocumentExtractor, then try using a normal parser.
+        // This will ensure that the contents are made available to the user, so
+        //  they see the text, but without fine-grained control/extraction
+        // (This also maintains backward compatibility with older versions!)
+
+        // If the user gave a parser, use that, if not the default
+        Parser parser = context.get(AutoDetectParser.class);
+        if (parser == null) {
+            parser = context.get(Parser.class);
+        }
+        if (parser == null) {
+            TikaConfig tikaConfig = context.get(TikaConfig.class);
+            if (tikaConfig == null) {
+                tikaConfig = TikaConfig.getDefaultConfig();
+            }
+            parser = new AutoDetectParser(tikaConfig.getParser());
+        }
+        ParseContext ctx = new ParseContext();
+        ctx.set(Parser.class, parser);
+        return new ParsingEmbeddedDocumentExtractor(ctx);
+    }
+
+    /**
+     * Passes one body to the embedded document extractor.
+     *
+     * @throws MimeException wrapping any SAXException raised by the extractor
+     */
+    public void body(BodyDescriptor body, InputStream is) throws MimeException,
+            IOException {
+        // use a different metadata object
+        // in order to specify the mime type of the
+        // sub part without damaging the main metadata
+
+        Metadata submd = new Metadata();
+        submd.set(Metadata.CONTENT_TYPE, body.getMimeType());
+        submd.set(Metadata.CONTENT_ENCODING, body.getCharset());
+
+        try {
+            if (extractor.shouldParseEmbedded(submd)) {
+                extractor.parseEmbedded(is, handler, submd, false);
+            }
+        } catch (SAXException e) {
+            throw new MimeException(e);
+        }
+    }
+
+    /** Closes the elements opened by {@link #startBodyPart()}. */
+    public void endBodyPart() throws MimeException {
+        try {
+            handler.endElement("p");
+            handler.endElement("div");
+        } catch (SAXException e) {
+            throw new MimeException(e);
+        }
+    }
+
+    public void endHeader() throws MimeException {
+    }
+
+    public void startMessage() throws MimeException {
+        try {
+            handler.startDocument();
+        } catch (SAXException e) {
+            throw new MimeException(e);
+        }
+    }
+
+    public void endMessage() throws MimeException {
+        try {
+            handler.endDocument();
+        } catch (SAXException e) {
+            throw new MimeException(e);
+        }
+    }
+
+    public void endMultipart() throws MimeException {
+        inPart = false;
+    }
+
+    public void epilogue(InputStream is) throws MimeException, IOException {
+    }
+
+    /**
+     * Header for the whole message or its parts. Only headers seen outside a
+     * multipart are recorded: From, Subject, To, Cc, Bcc and Date.
+     * <p>
+     * Runtime failures while interpreting a header are rethrown only when
+     * strict parsing is enabled; otherwise the header is skipped.
+     *
+     * @see Field
+     */
+    public void field(Field field) throws MimeException {
+        // inPart indicates whether these metadata correspond to the
+        // whole message or its parts
+        if (inPart) {
+            return;
+        }
+
+        try {
+            String fieldname = field.getName();
+            ParsedField parsedField = LenientFieldParser.getParser().parse(
+                    field, DecodeMonitor.SILENT);
+            if (fieldname.equalsIgnoreCase("From")) {
+                MailboxListField fromField = (MailboxListField) parsedField;
+                MailboxList mailboxList = fromField.getMailboxList();
+                if (fromField.isValidField() && mailboxList != null) {
+                    for (Address address : mailboxList) {
+                        String from = getDisplayString(address);
+                        metadata.add(Metadata.MESSAGE_FROM, from);
+                        metadata.add(TikaCoreProperties.CREATOR, from);
+                    }
+                } else {
+                    // unparseable From header: fall back to the raw text
+                    String from = stripOutFieldPrefix(field, "From:");
+                    if (from.startsWith("<")) {
+                        from = from.substring(1);
+                    }
+                    if (from.endsWith(">")) {
+                        from = from.substring(0, from.length() - 1);
+                    }
+                    if (!from.isEmpty()) {
+                        metadata.add(Metadata.MESSAGE_FROM, from);
+                        metadata.add(TikaCoreProperties.CREATOR, from);
+                    }
+                }
+            } else if (fieldname.equalsIgnoreCase("Subject")) {
+                metadata.add(TikaCoreProperties.TRANSITION_SUBJECT_TO_DC_TITLE,
+                        ((UnstructuredField) parsedField).getValue());
+            } else if (fieldname.equalsIgnoreCase("To")) {
+                processAddressList(parsedField, "To:", Metadata.MESSAGE_TO);
+            } else if (fieldname.equalsIgnoreCase("CC")) {
+                processAddressList(parsedField, "Cc:", Metadata.MESSAGE_CC);
+            } else if (fieldname.equalsIgnoreCase("BCC")) {
+                processAddressList(parsedField, "Bcc:", Metadata.MESSAGE_BCC);
+            } else if (fieldname.equalsIgnoreCase("Date")) {
+                DateTimeField dateField = (DateTimeField) parsedField;
+                metadata.set(TikaCoreProperties.CREATED, dateField.getDate());
+            }
+        } catch (RuntimeException me) {
+            if (strictParsing) {
+                throw me;
+            }
+        }
+    }
+
+    /**
+     * Adds every address of an address-list header to the given metadata key.
+     * If the header did not parse into an address list, the raw header text is
+     * split on commas instead; empty entries are skipped.
+     *
+     * @param field           the parsed header
+     * @param addressListType header prefix including the colon, e.g. "To:"
+     * @param metadataField   metadata key that receives the addresses
+     */
+    private void processAddressList(ParsedField field, String addressListType,
+                                    String metadataField) throws MimeException {
+        AddressListField toField = (AddressListField) field;
+        AddressList addressList = toField.isValidField() ? toField.getAddressList() : null;
+        if (addressList != null) {
+            for (Address address : addressList) {
+                metadata.add(metadataField, getDisplayString(address));
+            }
+        } else {
+            String to = stripOutFieldPrefix(field, addressListType);
+            for (String eachTo : to.split(",")) {
+                String trimmed = eachTo.trim();
+                if (!trimmed.isEmpty()) {
+                    metadata.add(metadataField, trimmed);
+                }
+            }
+        }
+    }
+
+    /**
+     * Renders a mailbox as "Name &lt;address&gt;" (or just the address when
+     * it has no name); any other address type is rendered via toString().
+     */
+    private String getDisplayString(Address address) {
+        if (address instanceof Mailbox) {
+            Mailbox mailbox = (Mailbox) address;
+            String name = mailbox.getName();
+            if (name != null && name.length() > 0) {
+                name = DecoderUtil.decodeEncodedWords(name, DecodeMonitor.SILENT);
+                return name + " <" + mailbox.getAddress() + ">";
+            } else {
+                return mailbox.getAddress();
+            }
+        } else {
+            return address.toString();
+        }
+    }
+
+    public void preamble(InputStream is) throws MimeException, IOException {
+    }
+
+    public void raw(InputStream is) throws MimeException, IOException {
+    }
+
+    /** Opens the div/p pair that wraps one body part in the output. */
+    public void startBodyPart() throws MimeException {
+        try {
+            handler.startElement("div", "class", "email-entry");
+            handler.startElement("p");
+        } catch (SAXException e) {
+            throw new MimeException(e);
+        }
+    }
+
+    public void startHeader() throws MimeException {
+    }
+
+    public void startMultipart(BodyDescriptor descr) throws MimeException {
+        inPart = true;
+    }
+
+    /**
+     * Returns the raw header text after skipping as many characters as the
+     * given prefix is long, plus any spaces that follow. Returns an empty
+     * string when nothing remains.
+     *
+     * @param field     the header whose raw text is read
+     * @param fieldname prefix to skip; only its length is used
+     */
+    private String stripOutFieldPrefix(Field field, String fieldname) {
+        String temp = field.getRaw().toString();
+        // bounds-checked so an empty or all-space value cannot overrun the string
+        int loc = Math.min(fieldname.length(), temp.length());
+        while (loc < temp.length() && temp.charAt(loc) == ' ') {
+            loc++;
+        }
+        return temp.substring(loc);
+    }
+
+}

Added: 
tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java?rev=1723223&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java
 Wed Jan  6 03:50:50 2016
@@ -0,0 +1,95 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mail;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.james.mime4j.MimeException;
+import org.apache.james.mime4j.parser.MimeStreamParser;
+import org.apache.james.mime4j.stream.MimeConfig;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Uses apache-mime4j to parse emails. Each part is treated with the
+ * corresponding parser and displayed within elements.
+ * <p>
+ * A {@link MimeConfig} object can be passed in the parsing context
+ * to better control the parsing process.
+ *
+ * @author [email protected]
+ */
+public class RFC822Parser extends AbstractParser {
+    /**
+     * Serial version UID
+     */
+    private static final long serialVersionUID = -5504243905998074168L;
+
+    private static final Set<MediaType> SUPPORTED_TYPES = Collections
+            .singleton(MediaType.parse("message/rfc822"));
+
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Parses one RFC 822 message through mime4j, bridging its events to the
+     * given SAX handler via a {@link MailContentHandler}.
+     *
+     * @throws TikaException if mime4j fails, or rethrown when it is the cause
+     *                       of a MimeException
+     * @throws SAXException  rethrown when it is the cause of a MimeException
+     */
+    public void parse(InputStream stream, ContentHandler handler,
+                      Metadata metadata, ParseContext context) throws IOException,
+            SAXException, TikaException {
+        // Default mime4j configuration, used unless the context supplies one
+        MimeConfig defaults = new MimeConfig();
+        defaults.setMaxLineLen(100000);
+        defaults.setMaxHeaderLen(100000); // max length of any individual header
+        MimeConfig config = context.get(MimeConfig.class, defaults);
+
+        MimeStreamParser mimeParser = new MimeStreamParser(config);
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+
+        mimeParser.setContentHandler(
+                new MailContentHandler(xhtml, metadata, context, config.isStrictParsing()));
+        mimeParser.setContentDecoding(true);
+
+        TikaInputStream tis = TikaInputStream.get(stream);
+        try {
+            mimeParser.parse(tis);
+        } catch (IOException e) {
+            tis.throwIfCauseOf(e);
+            throw new TikaException("Failed to parse an email message", e);
+        } catch (MimeException e) {
+            // Unwrap the exception in case it was not thrown by mime4j
+            Throwable cause = e.getCause();
+            if (cause instanceof TikaException) {
+                throw (TikaException) cause;
+            }
+            if (cause instanceof SAXException) {
+                throw (SAXException) cause;
+            }
+            throw new TikaException("Failed to parse an email message", e);
+        }
+    }
+
+}

Added: 
tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java?rev=1723223&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java
 Wed Jan  6 03:50:50 2016
@@ -0,0 +1,209 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mbox;
+
+import java.io.BufferedReader;
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.text.ParseException;
+import java.text.SimpleDateFormat;
+import java.util.Collections;
+import java.util.Date;
+import java.util.HashMap;
+import java.util.LinkedList;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Queue;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Mbox (mailbox) parser. Splits the mailbox on lines starting with
+ * {@link #MBOX_RECORD_DIVIDER} and hands each mail to an
+ * {@link EmbeddedDocumentExtractor} as a {@code message/rfc822} document.
+ * <p>
+ * When tracking is enabled, the metadata of every mail is also kept in
+ * {@link #getTrackingMetadata()}, keyed by its position in the mailbox.
+ */
+public class MboxParser extends AbstractParser {
+
+    public static final String MBOX_MIME_TYPE = "application/mbox";
+    public static final String MBOX_RECORD_DIVIDER = "From ";
+    /** Once a buffered mail reaches this many bytes, no further lines are added to it. */
+    public static final int MAIL_MAX_SIZE = 50000000;
+    /**
+     * Serial version UID
+     */
+    private static final long serialVersionUID = -1762689436731160661L;
+    private static final Set<MediaType> SUPPORTED_TYPES = Collections.singleton(MediaType.application("mbox"));
+    private static final Pattern EMAIL_HEADER_PATTERN = Pattern.compile("([^ ]+):[ \t]*(.*)");
+    private static final Pattern EMAIL_ADDRESS_PATTERN = Pattern.compile("<(.*@.*)>");
+
+    private static final String EMAIL_HEADER_METADATA_PREFIX = "MboxParser-";
+    private static final String EMAIL_FROMLINE_METADATA = EMAIL_HEADER_METADATA_PREFIX + "from";
+    private final Map<Integer, Metadata> trackingMetadata = new HashMap<Integer, Metadata>();
+    private boolean tracking = false;
+
+    /**
+     * Parses a date of the form {@code EEE, d MMM yyyy HH:mm:ss Z} (US locale).
+     *
+     * @throws ParseException if the text does not match that format
+     */
+    public static Date parseDate(String headerContent) throws ParseException {
+        SimpleDateFormat dateFormat = new SimpleDateFormat("EEE, d MMM yyyy HH:mm:ss Z", Locale.US);
+        return dateFormat.parse(headerContent);
+    }
+
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Reads the mailbox line by line as windows-1252 text. Lines before the
+     * first divider are skipped. Reading stops at end of input or when the
+     * current thread is interrupted. Empty input yields an empty document.
+     */
+    public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
+            throws IOException, TikaException, SAXException {
+
+        EmbeddedDocumentExtractor extractor = context.get(EmbeddedDocumentExtractor.class,
+                new ParsingEmbeddedDocumentExtractor(context));
+
+        String charsetName = "windows-1252";
+
+        metadata.set(Metadata.CONTENT_TYPE, MBOX_MIME_TYPE);
+        metadata.set(Metadata.CONTENT_ENCODING, charsetName);
+
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+
+        InputStreamReader isr = new InputStreamReader(stream, charsetName);
+        try (BufferedReader reader = new BufferedReader(isr)) {
+            String curLine = reader.readLine();
+            int mailItem = 0;
+            // null means end of input (including a completely empty mailbox)
+            while (curLine != null) {
+                if (curLine.startsWith(MBOX_RECORD_DIVIDER)) {
+                    Metadata mailMetadata = new Metadata();
+                    // logical lines of the mail, with folded continuations joined
+                    LinkedList<String> multiline = new LinkedList<String>();
+                    mailMetadata.add(EMAIL_FROMLINE_METADATA, curLine.substring(MBOX_RECORD_DIVIDER.length()));
+                    mailMetadata.set(Metadata.CONTENT_TYPE, "message/rfc822");
+                    curLine = reader.readLine();
+
+                    ByteArrayOutputStream message = new ByteArrayOutputStream(100000);
+                    // The first line after the divider is always taken; the
+                    // mailbox may also end right after the divider line.
+                    while (curLine != null) {
+                        if (curLine.startsWith(" ") || curLine.startsWith("\t")) {
+                            // continuation: append to the most recently added line
+                            String previous = multiline.pollLast();
+                            if (previous == null) {
+                                multiline.add(curLine.trim());
+                            } else {
+                                multiline.add(previous + " " + curLine.trim());
+                            }
+                        } else {
+                            multiline.add(curLine);
+                        }
+
+                        message.write(curLine.getBytes(charsetName));
+                        message.write(0x0A);
+                        curLine = reader.readLine();
+                        if (curLine != null
+                                && (curLine.startsWith(MBOX_RECORD_DIVIDER) || message.size() >= MAIL_MAX_SIZE)) {
+                            break;
+                        }
+                    }
+
+                    // NOTE(review): every line of the mail, not only the header
+                    // section, is offered to the header matcher - confirm intended.
+                    for (String item : multiline) {
+                        saveHeaderInMetadata(mailMetadata, item);
+                    }
+
+                    ByteArrayInputStream messageStream = new ByteArrayInputStream(message.toByteArray());
+                    message = null;
+
+                    if (extractor.shouldParseEmbedded(mailMetadata)) {
+                        extractor.parseEmbedded(messageStream, xhtml, mailMetadata, true);
+                    }
+
+                    if (tracking) {
+                        getTrackingMetadata().put(mailItem++, mailMetadata);
+                    }
+                } else {
+                    curLine = reader.readLine();
+                }
+
+                if (Thread.currentThread().isInterrupted()) {
+                    break;
+                }
+            }
+        }
+
+        xhtml.endDocument();
+    }
+
+    public boolean isTracking() {
+        return tracking;
+    }
+
+    public void setTracking(boolean tracking) {
+        this.tracking = tracking;
+    }
+
+    public Map<Integer, Metadata> getTrackingMetadata() {
+        return trackingMetadata;
+    }
+
+    /**
+     * Maps one "Name: value" line onto the mail's metadata. Lines that do not
+     * match that shape are ignored; unrecognised header names are stored under
+     * the "MboxParser-" prefix with the name lower-cased.
+     */
+    private void saveHeaderInMetadata(Metadata metadata, String curLine) {
+        Matcher headerMatcher = EMAIL_HEADER_PATTERN.matcher(curLine);
+        if (!headerMatcher.matches()) {
+            return; // ignore malformed header lines
+        }
+
+        String headerTag = headerMatcher.group(1).toLowerCase(Locale.ROOT);
+        String headerContent = headerMatcher.group(2);
+
+        if (headerTag.equalsIgnoreCase("From")) {
+            metadata.set(TikaCoreProperties.CREATOR, headerContent);
+        } else if (headerTag.equalsIgnoreCase("To") || headerTag.equalsIgnoreCase("Cc")
+                || headerTag.equalsIgnoreCase("Bcc")) {
+            Matcher address = EMAIL_ADDRESS_PATTERN.matcher(headerContent);
+            if (address.find()) {
+                metadata.add(Metadata.MESSAGE_RECIPIENT_ADDRESS, address.group(1));
+            } else if (headerContent.indexOf('@') > -1) {
+                metadata.add(Metadata.MESSAGE_RECIPIENT_ADDRESS, headerContent);
+            }
+
+            String property = Metadata.MESSAGE_TO;
+            if (headerTag.equalsIgnoreCase("Cc")) {
+                property = Metadata.MESSAGE_CC;
+            } else if (headerTag.equalsIgnoreCase("Bcc")) {
+                property = Metadata.MESSAGE_BCC;
+            }
+            metadata.add(property, headerContent);
+        } else if (headerTag.equalsIgnoreCase("Subject")) {
+            metadata.add(Metadata.SUBJECT, headerContent);
+        } else if (headerTag.equalsIgnoreCase("Date")) {
+            try {
+                Date date = parseDate(headerContent);
+                metadata.set(TikaCoreProperties.CREATED, date);
+            } catch (ParseException ignored) {
+                // ignoring date because format was not understood
+            }
+        } else if (headerTag.equalsIgnoreCase("Message-Id")) {
+            metadata.set(TikaCoreProperties.IDENTIFIER, headerContent);
+        } else if (headerTag.equalsIgnoreCase("In-Reply-To")) {
+            metadata.set(TikaCoreProperties.RELATION, headerContent);
+        } else if (headerTag.equalsIgnoreCase("Content-Type")) {
+            // TODO - key off content-type in headers to
+            // set mapping to use for content and convert if necessary.
+
+            metadata.add(Metadata.CONTENT_TYPE, headerContent);
+            metadata.set(TikaCoreProperties.FORMAT, headerContent);
+        } else {
+            metadata.add(EMAIL_HEADER_METADATA_PREFIX + headerTag, headerContent);
+        }
+    }
+}

Added: 
tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java?rev=1723223&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java
 Wed Jan  6 03:50:50 2016
@@ -0,0 +1,203 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mbox;
+
+import static java.lang.String.valueOf;
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static java.util.Collections.singleton;
+
+import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Set;
+
+import com.pff.PSTAttachment;
+import com.pff.PSTFile;
+import com.pff.PSTFolder;
+import com.pff.PSTMessage;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+/**
+ * Parser for MS Outlook PST email storage files.
+ * <p>
+ * The folder tree of the PST file is walked recursively. Every message is
+ * handed to the {@link EmbeddedDocumentExtractor} as an embedded document,
+ * followed by each of its attachments. Folder display names and message
+ * subjects are written to the XHTML output as {@code h1} headings.
+ */
+public class OutlookPSTParser extends AbstractParser {
+
+    private static final long serialVersionUID = 620998217748364063L;
+
+    public static final MediaType MS_OUTLOOK_PST_MIMETYPE =
+            MediaType.application("vnd.ms-outlook-pst");
+    private static final Set<MediaType> SUPPORTED_TYPES =
+            singleton(MS_OUTLOOK_PST_MIMETYPE);
+
+    /**
+     * Builds an attribute list that holds a single CDATA attribute.
+     *
+     * @param attName  used as both local and qualified attribute name
+     * @param attValue the attribute value
+     * @return a new attribute list containing only that attribute
+     */
+    private static AttributesImpl createAttribute(String attName,
+                                                  String attValue) {
+        AttributesImpl attributes = new AttributesImpl();
+        attributes.addAttribute("", attName, attName, "CDATA", attValue);
+        return attributes;
+    }
+
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Opens the PST data as a file, records its length and validity in
+     * {@code metadata} and, when the file descriptor is valid, emits every
+     * folder, message and attachment below the root folder.
+     *
+     * @param stream   the PST data
+     * @param handler  receives the XHTML output
+     * @param metadata receives the content type, the content length and the
+     *                 {@code isValid} flag
+     * @param context  consulted for an {@link EmbeddedDocumentExtractor};
+     *                 a {@link ParsingEmbeddedDocumentExtractor} is used
+     *                 when none is registered
+     * @throws TikaException wrapping any exception raised while reading the
+     *                       PST file or its content, or when the temporary
+     *                       resources cannot be released
+     */
+    @Override
+    public void parse(InputStream stream, ContentHandler handler,
+                      Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+
+        // Use the delegate parser to parse the contained document
+        EmbeddedDocumentExtractor embeddedExtractor = context.get(
+                EmbeddedDocumentExtractor.class,
+                new ParsingEmbeddedDocumentExtractor(context));
+
+        metadata.set(Metadata.CONTENT_TYPE,
+                MS_OUTLOOK_PST_MIMETYPE.toString());
+
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+
+        // PSTFile needs a path on disk. Any temporary file that getFile()
+        // creates for that purpose is owned by tmp, so it can be removed
+        // below without closing the caller's stream.
+        TemporaryResources tmp = new TemporaryResources();
+        TikaInputStream in = TikaInputStream.get(stream, tmp);
+        PSTFile pstFile = null;
+        try {
+            pstFile = new PSTFile(in.getFile().getPath());
+            metadata.set(Metadata.CONTENT_LENGTH,
+                    valueOf(pstFile.getFileHandle().length()));
+            boolean isValid = pstFile.getFileHandle().getFD().valid();
+            metadata.set("isValid", valueOf(isValid));
+            if (isValid) {
+                parseFolder(xhtml, pstFile.getRootFolder(), embeddedExtractor);
+            }
+        } catch (Exception e) {
+            throw new TikaException(e.getMessage(), e);
+        } finally {
+            // Close the PST file handle first so that the backing file is
+            // no longer open when the temporary resources are disposed.
+            if (pstFile != null && pstFile.getFileHandle() != null) {
+                try {
+                    pstFile.getFileHandle().close();
+                } catch (IOException e) {
+                    //swallow closing exception
+                }
+            }
+            tmp.dispose();
+        }
+
+        xhtml.endDocument();
+    }
+
+    /**
+     * Emits all messages of {@code pstFolder} and then recurses into its
+     * sub-folders.
+     * <p>
+     * Each message becomes a {@code div class="embedded"} whose id is the
+     * internet message id; each sub-folder becomes a
+     * {@code div class="email-folder"} headed by its display name.
+     *
+     * @param handler           receives the XHTML output
+     * @param pstFolder         the folder to walk
+     * @param embeddedExtractor receives message bodies and attachments
+     * @throws Exception if reading the folder or emitting content fails
+     */
+    private void parseFolder(XHTMLContentHandler handler, PSTFolder pstFolder,
+                             EmbeddedDocumentExtractor embeddedExtractor)
+            throws Exception {
+        if (pstFolder.getContentCount() > 0) {
+            PSTMessage pstMail = (PSTMessage) pstFolder.getNextChild();
+            while (pstMail != null) {
+                AttributesImpl attributes = new AttributesImpl();
+                attributes.addAttribute("", "class", "class", "CDATA",
+                        "embedded");
+                attributes.addAttribute("", "id", "id", "CDATA",
+                        pstMail.getInternetMessageId());
+                handler.startElement("div", attributes);
+                handler.element("h1", pstMail.getSubject());
+
+                parserMailItem(handler, pstMail, embeddedExtractor);
+                parseMailAttachments(handler, pstMail, embeddedExtractor);
+
+                handler.endElement("div");
+
+                pstMail = (PSTMessage) pstFolder.getNextChild();
+            }
+        }
+
+        if (pstFolder.hasSubfolders()) {
+            for (PSTFolder pstSubFolder : pstFolder.getSubFolders()) {
+                handler.startElement("div",
+                        createAttribute("class", "email-folder"));
+                handler.element("h1", pstSubFolder.getDisplayName());
+                parseFolder(handler, pstSubFolder, embeddedExtractor);
+                handler.endElement("div");
+            }
+        }
+    }
+
+    /**
+     * Copies the properties of one message into a fresh {@link Metadata}
+     * object and passes the message body, encoded as UTF-8, to the embedded
+     * document extractor.
+     *
+     * @param handler           receives the XHTML output
+     * @param pstMail           the message to emit
+     * @param embeddedExtractor receives the message body
+     */
+    private void parserMailItem(XHTMLContentHandler handler,
+                                PSTMessage pstMail,
+                                EmbeddedDocumentExtractor embeddedExtractor)
+            throws SAXException, IOException {
+        // The internet message id serves as name, relationship id and
+        // identifier of the embedded mail.
+        String messageId = pstMail.getInternetMessageId();
+
+        Metadata mailMetadata = new Metadata();
+        mailMetadata.set(Metadata.RESOURCE_NAME_KEY, messageId);
+        mailMetadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, messageId);
+        mailMetadata.set(TikaCoreProperties.IDENTIFIER, messageId);
+        mailMetadata.set(TikaCoreProperties.TITLE, pstMail.getSubject());
+        mailMetadata.set(Metadata.MESSAGE_FROM, pstMail.getSenderName());
+        mailMetadata.set(TikaCoreProperties.CREATOR, pstMail.getSenderName());
+        mailMetadata.set(TikaCoreProperties.CREATED,
+                pstMail.getCreationTime());
+        mailMetadata.set(TikaCoreProperties.MODIFIED,
+                pstMail.getLastModificationTime());
+        mailMetadata.set(TikaCoreProperties.COMMENTS, pstMail.getComment());
+        mailMetadata.set("descriptorNodeId",
+                valueOf(pstMail.getDescriptorNodeId()));
+        mailMetadata.set("senderEmailAddress",
+                pstMail.getSenderEmailAddress());
+        mailMetadata.set("recipients", pstMail.getRecipientsString());
+        mailMetadata.set("displayTo", pstMail.getDisplayTo());
+        mailMetadata.set("displayCC", pstMail.getDisplayCC());
+        mailMetadata.set("displayBCC", pstMail.getDisplayBCC());
+        mailMetadata.set("importance", valueOf(pstMail.getImportance()));
+        mailMetadata.set("priority", valueOf(pstMail.getPriority()));
+        mailMetadata.set("flagged", valueOf(pstMail.isFlagged()));
+
+        byte[] mailContent = pstMail.getBody().getBytes(UTF_8);
+        embeddedExtractor.parseEmbedded(
+                new ByteArrayInputStream(mailContent),
+                handler, mailMetadata, true);
+    }
+
+    /**
+     * Emits every attachment of {@code email}: the file name as a paragraph,
+     * then a {@code div class="embedded"} that holds whatever the embedded
+     * document extractor produces for the attachment content.
+     *
+     * @param xhtml             receives the XHTML output
+     * @param email             the message whose attachments are emitted
+     * @param embeddedExtractor decides whether an attachment is parsed and
+     *                          receives its content
+     * @throws TikaException wrapping any exception raised while reading or
+     *                       parsing an attachment
+     */
+    private void parseMailAttachments(XHTMLContentHandler xhtml,
+                                      PSTMessage email,
+                                      EmbeddedDocumentExtractor
+                                              embeddedExtractor)
+            throws TikaException {
+        int numberOfAttachments = email.getNumberOfAttachments();
+        for (int i = 0; i < numberOfAttachments; i++) {
+            try {
+                PSTAttachment attach = email.getAttachment(i);
+
+                // Get the filename; both long and short filenames can be
+                // used for attachments
+                String filename = attach.getLongFilename();
+                if (filename.isEmpty()) {
+                    filename = attach.getFilename();
+                }
+
+                xhtml.element("p", filename);
+
+                Metadata attachMeta = new Metadata();
+                attachMeta.set(Metadata.RESOURCE_NAME_KEY, filename);
+                attachMeta.set(Metadata.EMBEDDED_RELATIONSHIP_ID, filename);
+                AttributesImpl attributes = new AttributesImpl();
+                attributes.addAttribute("", "class", "class", "CDATA",
+                        "embedded");
+                attributes.addAttribute("", "id", "id", "CDATA", filename);
+                xhtml.startElement("div", attributes);
+                if (embeddedExtractor.shouldParseEmbedded(attachMeta)) {
+                    // tmp owns anything the TikaInputStream spools to disk
+                    TemporaryResources tmp = new TemporaryResources();
+                    try {
+                        TikaInputStream tis = TikaInputStream.get(
+                                attach.getFileInputStream(), tmp);
+                        embeddedExtractor.parseEmbedded(tis, xhtml,
+                                attachMeta, true);
+                    } finally {
+                        tmp.dispose();
+                    }
+                }
+                xhtml.endElement("div");
+
+            } catch (Exception e) {
+                throw new TikaException("Unable to unpack document stream", e);
+            }
+        }
+    }
+
+}

Added: 
tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector?rev=1723223&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector
 Wed Jan  6 03:50:50 2016
@@ -0,0 +1,16 @@
+#  Licensed to the Apache Software Foundation (ASF) under one or more
+#  contributor license agreements.  See the NOTICE file distributed with
+#  this work for additional information regarding copyright ownership.
+#  The ASF licenses this file to You under the Apache License, Version 2.0
+#  (the "License"); you may not use this file except in compliance with
+#  the License.  You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+org.apache.tika.parser.html.HtmlEncodingDetector

Added: 
tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser?rev=1723223&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
 Wed Jan  6 03:50:50 2016
@@ -0,0 +1,22 @@
+#  Licensed to the Apache Software Foundation (ASF) under one or more
+#  contributor license agreements.  See the NOTICE file distributed with
+#  this work for additional information regarding copyright ownership.
+#  The ASF licenses this file to You under the Apache License, Version 2.0
+#  (the "License"); you may not use this file except in compliance with
+#  the License.  You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+
+org.apache.tika.parser.feed.FeedParser
+org.apache.tika.parser.html.HtmlParser
+org.apache.tika.parser.mail.RFC822Parser
+org.apache.tika.parser.mbox.MboxParser
+org.apache.tika.parser.mbox.OutlookPSTParser
+org.apache.tika.parser.iptc.IptcAnpaParser

Added: 
tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/resources/org/apache/tika/parser/ctakes/CTAKESConfig.properties
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/resources/org/apache/tika/parser/ctakes/CTAKESConfig.properties?rev=1723223&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/resources/org/apache/tika/parser/ctakes/CTAKESConfig.properties
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/resources/org/apache/tika/parser/ctakes/CTAKESConfig.properties
 Wed Jan  6 03:50:50 2016
@@ -0,0 +1,22 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+aeDescriptorPath=/ctakes-clinical-pipeline/desc/analysis_engine/AggregatePlaintextUMLSProcessor.xml
+text=true
+annotationProps=BEGIN,END,ONTOLOGY_CONCEPT_ARR
+separatorChar=:
+metadata=Study Title,Study Description
+UMLSUser=
+UMLSPass=

Added: 
tika/branches/2.x/tika-parser-modules/tika-web-module/src/test/java/org/apache/tika/parser/feed/FeedParserTest.java
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-web-module/src/test/java/org/apache/tika/parser/feed/FeedParserTest.java?rev=1723223&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-web-module/src/test/java/org/apache/tika/parser/feed/FeedParserTest.java
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-web-module/src/test/java/org/apache/tika/parser/feed/FeedParserTest.java
 Wed Jan  6 03:50:50 2016
@@ -0,0 +1,75 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.feed;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+
+import java.io.InputStream;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+public class FeedParserTest {
+
+    /**
+     * Parses the sample RSS document and checks the extracted description
+     * and title.
+     */
+    @Test
+    public void testRSSParser() throws Exception {
+        assertFeed("/test-documents/rsstest.rss",
+                "Sample RSS File for Junit test", "TestChannel");
+
+        // TODO find a way of testing the paragraphs and anchors
+    }
+
+    /**
+     * Parses the sample Atom document and checks the extracted description
+     * and title.
+     */
+    @Test
+    public void testAtomParser() throws Exception {
+        assertFeed("/test-documents/testATOM.atom",
+                "Sample Atom File for Junit test", "Test Atom Feed");
+
+        // TODO Check some more
+    }
+
+    /**
+     * Runs {@link FeedParser} over a classpath resource and asserts that
+     * body text was produced and that the description and title metadata
+     * match the expected values.
+     *
+     * @param resource            classpath location of the feed document
+     * @param expectedDescription expected value of the description property
+     * @param expectedTitle       expected value of the title property
+     */
+    private static void assertFeed(String resource,
+                                   String expectedDescription,
+                                   String expectedTitle) throws Exception {
+        try (InputStream feed =
+                     FeedParserTest.class.getResourceAsStream(resource)) {
+            Metadata parsed = new Metadata();
+            ContentHandler body = new BodyContentHandler();
+            ParseContext parseContext = new ParseContext();
+
+            new FeedParser().parse(feed, body, parsed, parseContext);
+
+            String text = body.toString();
+            assertFalse(text == null);
+
+            assertEquals(expectedDescription,
+                    parsed.get(TikaCoreProperties.DESCRIPTION));
+            assertEquals(expectedTitle,
+                    parsed.get(TikaCoreProperties.TITLE));
+        }
+    }
+
+}

svn commit: r1723223 [30/32] - in /tika/branches/2.x: tika-core/src/test/resources/META-INF/ tika-core/src/test/resources/META-INF/services/ tika-parser-modules/ tika-parser-modules/tika-advanced-module/ tika-parser-modules/tika-advanced-module/src/ ti...

Reply via email to