// Added: tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/iptc/IptcAnpaParser.java
// URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/iptc/IptcAnpaParser.java?rev=1723223&view=auto
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.iptc;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.Locale;
import java.util.Set;
import java.util.TimeZone;

import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

import static java.nio.charset.StandardCharsets.UTF_8;

/**
 * Parser for IPTC ANPA News Wire Feeds.
 * <p>
 * A message is split on its control bytes into a header (SOH..STX), a body
 * (STX..ETX) and a footer (ETX..EOT). The body and footer are picked apart
 * into title, subject, author, source and date properties, which are then
 * copied into the Tika {@link Metadata}; the body text is emitted as a
 * single XHTML paragraph.
 * <p>
 * NOTE: the wire format detected for the current message is kept in the
 * instance field {@link #FORMAT}, which every call to
 * {@link #parse(InputStream, ContentHandler, Metadata, ParseContext)}
 * overwrites. Concurrent parses on one instance can therefore interfere
 * with each other.
 */
public class IptcAnpaParser implements Parser {
    /** Serial version UID */
    private static final long serialVersionUID = -6062820170212879115L;

    private static final MediaType TYPE =
            MediaType.text("vnd.iptc.anpa");

    private static final Set<MediaType> SUPPORTED_TYPES =
            Collections.singleton(TYPE);

    private static final int FMT_ANPA_1312  = 0x00;  // "NAA 89-3 (ANPA 1312)"
    private static final int FMT_ANPA_UPI   = 0x01;  // "United Press International ANPA 1312 variant"
    private static final int FMT_ANPA_UPI_DL = 0x02; // "United Press International Down-Load Message"
    private static final int FMT_IPTC_7901  = 0x03;  // "IPTC7901 Recommended Message Format"
    private static final int FMT_IPTC_PHOTO = 0x04;  // "IPTC-NAA Digital Newsphoto Parameter Record"
    private static final int FMT_IPTC_CHAR  = 0x05;  // "IPTC Unstructured Character Oriented File Format (UCOFF)"
    private static final int FMT_NITF       = 0x06;  // "News Industry Text Format (NITF)"
    private static final int FMT_NITF_TT    = 0x07;  // "Tidningarnas Telegrambyra NITF version (TTNITF DTD)"
    private static final int FMT_NITF_RB    = 0x08;  // "Ritzaus Bureau NITF version (RBNITF DTD)"
    private static final int FMT_IPTC_AP    = 0x09;  // "Associated Press news wire format"
    private static final int FMT_IPTC_BLM   = 0x0A;  // "Bloomberg News news wire format"
    private static final int FMT_IPTC_NYT   = 0x0B;  // "New York Times news wire format"
    private static final int FMT_IPTC_RTR   = 0x0C;  // "Reuters news wire format"

    private final static char SOH = 0x01;    // start of header (ctrl-a)
    private final static char STX = 0x02;    // start of text (ctrl-b)
    private final static char ETX = 0x03;    // end of text (ctrl-c)
    private final static char EOT = 0x04;    // end of transmission (ctrl-d)
    private final static char SYN = 0x16;    // synchronous idle (ctrl-v)

    private final static char BS = 0x08;     // the backspace character (used for diacriticals)
    private final static char TB = 0x09;     // the tab character
    private final static char LF = 0x0A;     // line feed
    private final static char FF = 0x0C;     // form feed
    private final static char CR = 0x0D;     // carriage return
    private final static char XQ = 0x11;     // device control (ctrl-q)
    private final static char XS = 0x13;     // device control (ctrl-s)
    private final static char FS = 0x1F;     // a field delimiter

    private final static char HY = 0x2D;     // hyphen
    private final static char SP = 0x20;     // the blank space
    private final static char LT = 0x3C;     // less than
    private final static char EQ = 0x3D;     // equals sign
    private final static char CT = 0x5E;     // carat

    private final static char SL = 0x91;     // single-quote left
    private final static char SR = 0x92;     // single-quote right
    private final static char DL = 0x93;     // double-quote left
    private final static char DR = 0x94;     // double-quote right

    /** Wire format of the message currently being parsed; ANPA-1312 until detected otherwise. */
    private int FORMAT = FMT_ANPA_1312;

    public Set<MediaType> getSupportedTypes(ParseContext context) {
        return SUPPORTED_TYPES;
    }

    /**
     * Parses one news wire message: fills {@code metadata} from the message
     * and writes the cleaned body text to {@code handler} as one paragraph.
     */
    public void parse(
            InputStream stream, ContentHandler handler,
            Metadata metadata, ParseContext context)
            throws IOException, SAXException, TikaException {

        HashMap<String, String> properties = this.loadProperties(stream);
        this.setMetadata(metadata, properties);

        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();
        // TODO: put body content here
        xhtml.startElement("p");
        String body = clean(properties.get("body"));
        if (body != null) {
            xhtml.characters(body);
        }
        xhtml.endElement("p");
        xhtml.endDocument();
    }

    /**
     * @deprecated This method will be removed in Apache Tika 1.0.
     */
    public void parse(
            InputStream stream, ContentHandler handler, Metadata metadata)
            throws IOException, SAXException, TikaException {
        parse(stream, handler, metadata, new ParseContext());
    }

    /**
     * Scans the news message and stores the metadata and data into a map.
     * <p>
     * The section reader relies on {@code mark}/{@code reset}; a stream that
     * does not support them is wrapped in a {@link BufferedInputStream} so
     * that the sections can still be located.
     */
    private HashMap<String, String> loadProperties(InputStream is) {

        HashMap<String, String> properties = new HashMap<String, String>();

        InputStream in = is.markSupported() ? is : new BufferedInputStream(is);

        FORMAT = this.scanFormat(in);

        // read past anything left over in front of the header; the bytes themselves are not used
        this.getSection(in, "residual");

        byte[] header = this.getSection(in, "header");
        parseHeader(header, properties);

        byte[] body = this.getSection(in, "body");
        parseBody(body, properties);

        byte[] footer = this.getSection(in, "footer");
        parseFooter(footer, properties);

        return properties;
    }

    /**
     * Guesses the wire agency from marker strings found in the first 512K of
     * the message and then rewinds the stream (when it supports mark/reset).
     *
     * @return one of the {@code FMT_*} constants; {@code FMT_ANPA_1312} when
     *         no agency marker is found
     */
    private int scanFormat(InputStream is) {
        // always start from the default, so the result of an earlier parse cannot leak in
        int format = FMT_ANPA_1312;
        int maxsize = 524288;     //  512K

        byte[] buf = new byte[maxsize];
        try {
            if (is.markSupported()) {
                is.mark(maxsize);
            }
            int msgsize = is.read(buf);
            if (msgsize > 0) {
                // only decode what was actually read
                String message = new String(buf, 0, msgsize, UTF_8).toLowerCase(Locale.ROOT);

                // these are not if-then-else, because we want to go from most common
                // and fall through to least.  this is imperfect, as these tags could
                // show up in other agency stories, but i can't find a spec or any
                // explicit codes to identify the wire source in the message itself
                if (message.contains("ap-wf")) {
                    format = FMT_IPTC_AP;
                }
                if (message.contains("reuters")) {
                    format = FMT_IPTC_RTR;
                }
                if (message.contains("new york times")) {
                    format = FMT_IPTC_NYT;
                }
                if (message.contains("bloomberg news")) {
                    format = FMT_IPTC_BLM;
                }
            }
        } catch (IOException eio) {
            // we are in an unstable state; keep whatever format was determined so far
        }

        try {
            if (is.markSupported()) {
                is.reset();
            }
        } catch (IOException eio) {
            // we are in an unstable state
        }
        return format;
    }

    private void setFormat(int format) {
        this.FORMAT = format;
    }

    /**
     * @return the display name of the detected agency, or an empty string
     *         when the format is not one of the four recognised agencies
     */
    private String getFormatName() {
        switch (FORMAT) {
            case FMT_IPTC_AP:
                return "Associated Press";
            case FMT_IPTC_BLM:
                return "Bloomberg";
            case FMT_IPTC_NYT:
                return "New York Times";
            case FMT_IPTC_RTR:
                return "Reuters";
            default:
                return "";
        }
    }

    /**
     * Reads a named section ("residual", "header", "body" or "footer") from
     * the stream, using the control bytes that delimit that section.
     *
     * @return the bytes between the delimiters, or an empty array for an
     *         unknown name or when the section could not be found
     */
    private byte[] getSection(InputStream is, String name) {

        if (name.equals("residual")) {
            // from SYN [0x16 : ctrl-v] (may have leftover residue from preceding message)
            // up to SOH [0x01 : ctrl-a] (typically follows a pair of SYN)
            return getSection(is, 8192, (byte) SYN, (byte) SOH, true);
        }
        if (name.equals("header")) {
            // the header shouldn't be more than 1k, but just being generous here
            // from SOH [0x01 : ctrl-a] up to STX [0x02 : ctrl-b] (end of header, beginning of message)
            return getSection(is, 8192, (byte) SOH, (byte) STX, true);
        }
        if (name.equals("body")) {
            // the message shouldn't be more than 16k (?), leaving plenty of space
            // from STX [0x02 : ctrl-b] up to ETX [0x03 : ctrl-c] (end of message, beginning of footer)
            return getSection(is, 524288, (byte) STX, (byte) ETX, true);
        }
        if (name.equals("footer")) {
            // the footer shouldn't be more than 1k, leaving plenty of space
            // from ETX [0x03 : ctrl-c] up to EOT [0x04 : ctrl-d] (end of transmission)
            return getSection(is, 8192, (byte) ETX, (byte) EOT, true);
        }
        return new byte[0];
    }

    /**
     * Reads up to {@code maxsize} bytes and returns those lying strictly
     * between the first {@code bstart} byte and the next {@code bfinish} byte.
     * <p>
     * When the finish byte is found, the stream is left positioned on it, so
     * that it can act as the start byte of the following section. Otherwise
     * the stream is rewound to where it was on entry.
     *
     * @param is           stream to read from; must support mark/reset
     * @param maxsize      upper bound on the number of bytes examined (further
     *                     capped by {@code is.available()})
     * @param bstart       byte that opens the section
     * @param bfinish      byte that closes the section
     * @param ifincomplete when true, a section that was opened but never
     *                     closed is returned up to the end of the bytes read
     * @return the section content, or an empty array if nothing was found or
     *         an I/O error occurred
     */
    private byte[] getSection(InputStream is, int maxsize, byte bstart, byte bfinish, boolean ifincomplete) {
        byte[] value = new byte[0];

        try {
            boolean started = false;    // check if we have found the start flag
            boolean finished = false;   // check if we have found the finish flag
            int read = 0;               // the number of bytes we read
            int start = 0;              // the position after the start flag

            // TODO: this only pulls back 8K of data on a read, regardless of buffer size
            //       more nefariously, it caps at a total 8K, through all sections
            int streammax = is.available();
            maxsize = Math.min(maxsize, streammax);

            is.mark(maxsize);
            byte[] buf = new byte[maxsize];
            int totsize = 0;
            while (totsize < maxsize) {
                // ask only for the space that is left; asking for maxsize again after a
                // short read would run past the end of the buffer
                int msgsize = is.read(buf, totsize, maxsize - totsize);
                if (msgsize == -1) {
                    break;
                }
                totsize += msgsize;
            }

            // scan through the provided input stream
            for (read = 0; read < totsize; read++) {
                byte b = buf[read];

                if (!started) {
                    started = (b == bstart);
                    start = read + 1;
                    continue;
                }

                finished = (b == bfinish);
                if (finished) {
                    break;
                }
                // keep going until we run out of bytes, or hit the termination byte
            }

            // move the input stream back to where it was initially
            is.reset();

            if (finished) {
                // now, we want to reset the stream to be sitting right on top of the finish marker
                long toskip = read;
                while (toskip > 0) {
                    long skipped = is.skip(toskip);
                    if (skipped <= 0) {
                        break;   // cannot advance any further; leave the stream where it is
                    }
                    toskip -= skipped;
                }
                value = new byte[read - start];
                System.arraycopy(buf, start, value, 0, read - start);
            } else if (ifincomplete && started) {
                // the caller wants anything that was read, and we finished the stream or buffer
                value = new byte[read - start];
                System.arraycopy(buf, start, value, 0, read - start);
            }
        } catch (IOException eio) {
            // something invalid occurred, return an empty section
        }

        return value;
    }

    /**
     * Walks the header envelope: service id, category, subject, date and time.
     * None of these values is stored in {@code properties}.
     *
     * @return true if any of the header fields was non-empty
     */
    private boolean parseHeader(byte[] value, HashMap<String, String> properties) {
        StringBuilder env_serviceid = new StringBuilder();
        StringBuilder env_category = new StringBuilder();
        StringBuilder hdr_subject = new StringBuilder();
        StringBuilder hdr_date = new StringBuilder();
        StringBuilder hdr_time = new StringBuilder();

        int read = 0;

        while (read < value.length) {

            // pull apart the envelope, getting the service id  (....\x1f)
            while (read < value.length) {
                byte val_next = value[read++];
                if (val_next != FS) {
                    env_serviceid.append((char) (val_next & 0xff));  // convert the byte to an unsigned int
                } else {
                    break;
                }
            }

            // pull apart the envelope, getting the category  (....\x13\x11)
            while (read < value.length) {
                byte val_next = value[read++];
                if (val_next != XS) {   // the end of the envelope is marked (\x13)
                    env_category.append((char) (val_next & 0xff));  // convert the byte to an unsigned int
                } else {
                    // consume the remaining byte (\x11), if there is one
                    if ((read < value.length) && (value[read] == XQ)) {
                        read++;
                    }
                    break;
                }
            }

            // pull apart the envelope, getting the subject heading
            while (read < value.length) {
                boolean subject = true;
                byte val_next = value[read++];
                while ((subject) && (val_next != SP) && (val_next != 0x00)) {  // ignore the envelope subject
                    hdr_subject.append((char) (val_next & 0xff));  // convert the byte to an unsigned int
                    val_next = (read < value.length) ? value[read++] : 0x00;
                    while (val_next == SP) {  // consume all the spaces
                        subject = false;
                        val_next = (read < value.length) ? value[read++] : 0x00;
                        if (val_next != SP) {
                            --read;  // otherwise we eat into the next section
                        }
                    }
                }
                if (!subject) {
                    break;
                }
            }

            // pull apart the envelope, getting the date and time
            while (read < value.length) {
                byte val_next = value[read++];
                if (hdr_date.length() == 0) {
                    while (((val_next >= (byte) 0x30) && (val_next <= (byte) 0x39))  // consume all numerics and hyphens
                            || (val_next == HY)) {
                        hdr_date.append((char) (val_next & 0xff));  // convert the byte to an unsigned int
                        val_next = (read < value.length) ? value[read++] : 0x00;
                    }
                } else if (val_next == SP) {
                    while (val_next == SP) {  // consume all the spaces
                        val_next = (read < value.length) ? value[read++] : 0x00;
                    }
                    continue;
                } else {
                    while (((val_next >= (byte) 0x30) && (val_next <= (byte) 0x39))  // consume all numerics and hyphens
                            || (val_next == HY)) {
                        hdr_time.append((char) (val_next & 0xff));  // convert the byte to an unsigned int
                        val_next = (read < value.length) ? value[read++] : 0x00;
                    }
                }
            }
            break; // don't let this run back through and start thrashing metadata
        }

        // if we were saving any of these values, we would set the properties map here

        return (env_serviceid.length() + env_category.length() + hdr_subject.length()
                + hdr_date.length() + hdr_time.length()) > 0;
    }

    /**
     * Splits the message body into heading, title, author and source lines and
     * the running text, and stores them under the keys "subject", "title",
     * "author", "source" and "body".
     * <p>
     * Metadata lines start with a carat and end at '&lt;' or an end of line.
     * For Reuters and Bloomberg messages a carat is written back into
     * {@code value} where those agencies omit it, so the array content may be
     * modified.
     *
     * @return true if any of the stored values was non-empty
     */
    private boolean parseBody(byte[] value, HashMap<String, String> properties) {
        String bdy_heading = "";
        String bdy_title = "";
        String bdy_source = "";
        String bdy_author = "";
        StringBuilder bdy_body = new StringBuilder();   // can be large, so avoid String +=

        int read = 0;
        boolean done = false;

        while (!done && (read < value.length)) {

            // pull apart the body, getting the heading (^....\x0d\x0a)
            while (read < value.length) {
                byte val_next = value[read++];
                if (val_next == CT) {      //  start of a new section , first is the heading
                    val_next = (read < value.length) ? value[read++] : 0x00;
                    // AP, NYT, and Bloomberg end with < , Reuters with EOL
                    // (also stop on 0x00, which is what running off the end of the data yields)
                    while ((val_next != LT) && (val_next != CR) && (val_next != LF) && (val_next != 0)) {
                        bdy_heading += (char) (val_next & 0xff);  // convert the byte to an unsigned int
                        val_next = (read < value.length) ? value[read++] : 0x00;
                    }
                    if (val_next == LT) {
                        // hit the delimiter, carry on
                        val_next = (read < value.length) ? value[read++] : 0x00;
                    }
                    while (bdy_heading.length() > 0 && ((val_next == CR) || (val_next == LF))) {
                        val_next = (read < value.length) ? value[read++] : 0x00;  // skip the new lines
                        if ((val_next != CR) && (val_next != LF)) {
                            --read;
                        }
                    }
                } else {
                    // this will only be hit on poorly-formed files

                    // for reuters, the heading does not start with the ^, so we push one back into the stream
                    if (FORMAT == FMT_IPTC_RTR) {
                        // for any non-whitespace, we need to go back an additional step to not destroy the data
                        if ((val_next != SP) && (val_next != TB) && (val_next != CR) && (val_next != LF)) {
                            // if the very first byte is data, we have to shift the whole array, and stuff in a carat
                            if (read == 1) {
                                byte[] resize = new byte[value.length + 1];
                                System.arraycopy(value, 0, resize, 1, value.length);
                                value = resize;
                            }
                        }
                        value[--read] = CT;
                        continue;
                    }
                }
                break;
            }

            // pull apart the body, getting the title (^....\x0d\x0a)
            while (read < value.length) {
                byte val_next = value[read++];
                if (val_next == CT) {      //  start of a new section , second is the title
                    val_next = (read < value.length) ? value[read++] : 0x00;
                    // AP, NYT, and Bloomberg end with < , Reuters with EOL
                    // less than delimiter (\x3c), or carat (\x5e), EOL, or end of data
                    while ((val_next != LT) && (val_next != CT) && (val_next != CR) && (val_next != LF)
                            && (val_next != 0)) {
                        bdy_title += (char) (val_next & 0xff);  // convert the byte to an unsigned int
                        val_next = (read < value.length) ? value[read++] : 0x00;
                    }

                    if (val_next == CT) {      //  start of a new section , when first didn't finish cleanly
                        --read;
                    }

                    if (val_next == LT) {
                        // hit the delimiter, carry on
                        val_next = (read < value.length) ? value[read++] : 0x00;
                    }

                    while (bdy_title.length() > 0 && ((val_next == CR) || (val_next == LF))) {
                        val_next = (read < value.length) ? value[read++] : 0x00;  // skip the new lines
                        if ((val_next != CR) && (val_next != LF)) {
                            --read;
                        }
                    }
                } else {
                    // this will only be hit on poorly-formed files

                    // for bloomberg, the title does not start with the ^, so we push one back into the stream
                    if (FORMAT == FMT_IPTC_BLM) {
                        if (val_next == TB) {
                            value[--read] = CT;
                            continue;
                        }
                    }

                    // for reuters, the title does not start with the ^, so we push one back into the stream
                    if (FORMAT == FMT_IPTC_RTR) {
                        // for any non-whitespace, we need to go back an additional step to not destroy the data
                        if ((val_next != SP) && (val_next != TB) && (val_next != CR) && (val_next != LF)) {
                            --read;
                        }
                        value[--read] = CT;
                        continue;
                    }
                }
                break;
            }

            // at this point, we have a variable number of metadata lines, with various orders
            // we scan the start of each line for the special character, and run to the end character
            boolean metastarted = false;
            String longline = "";
            String longkey = "";
            while (read < value.length) {
                byte val_next = value[read++];

                // eat up whitespace before committing to the next section
                if ((val_next == SP) || (val_next == TB) || (val_next == CR) || (val_next == LF)) {
                    continue;
                }

                if (val_next == CT) {      //  start of a new section , could be authors, sources, etc
                    val_next = (read < value.length) ? value[read++] : 0x00;
                    String tmp_line = "";
                    while ((val_next != LT) && (val_next != CT) && (val_next != CR) && (val_next != LF)
                            && (val_next != 0)) {
                        // less than delimiter (\x3c), maybe also badly formed with just new line
                        tmp_line += (char) (val_next & 0xff);  // convert the byte to an unsigned int
                        val_next = (read < value.length) ? value[read++] : 0x00;
                    }

                    if (val_next == CT) {      //  start of a new section , when first didn't finish cleanly
                        --read;
                    }

                    if (val_next == LT) {
                        // hit the delimiter, carry on
                        val_next = (read < value.length) ? value[read++] : 0x00;
                    }

                    while ((val_next == CR) || (val_next == LF)) {
                        val_next = (read < value.length) ? value[read++] : 0x00;  // skip the new lines
                        if ((val_next != CR) && (val_next != LF)) {
                            --read;
                        }
                    }

                    if (tmp_line.toLowerCase(Locale.ROOT).startsWith("by") || longline.equals("bdy_author")) {
                        longkey = "bdy_author";

                        // prepend a space to subsequent line, so it gets parsed consistent with the lead line
                        tmp_line = (longline.equals(longkey) ? " " : "") + tmp_line;

                        // we have an author candidate
                        int term = tmp_line.length();
                        term = Math.min(term, (tmp_line.contains("<") ? tmp_line.indexOf("<") : term));
                        term = Math.min(term, (tmp_line.contains("=") ? tmp_line.indexOf("=") : term));
                        term = Math.min(term, (tmp_line.contains("\n") ? tmp_line.indexOf("\n") : term));
                        term = (term > 0) ? term : tmp_line.length();
                        // the name starts at the first space; skip lines where there is none
                        // before the terminator, instead of failing on the substring
                        int from = tmp_line.indexOf(" ");
                        if ((from >= 0) && (from <= term)) {
                            bdy_author += tmp_line.substring(from, term);
                        }
                        metastarted = true;
                        longline = ((tmp_line.contains("=")) && (!longline.equals(longkey)) ? longkey : "");
                    } else if (FORMAT == FMT_IPTC_BLM) {
                        String byline = "   by ";
                        String lower_line = tmp_line.toLowerCase(Locale.ROOT);
                        if (lower_line.contains(byline)) {
                            longkey = "bdy_author";

                            int term = tmp_line.length();
                            term = Math.min(term, (tmp_line.contains("<") ? tmp_line.indexOf("<") : term));
                            term = Math.min(term, (tmp_line.contains("=") ? tmp_line.indexOf("=") : term));
                            term = Math.min(term, (tmp_line.contains("\n") ? tmp_line.indexOf("\n") : term));
                            term = (term > 0) ? term : tmp_line.length();
                            // for bloomberg, the author line sits below their copyright statement
                            int from = lower_line.indexOf(byline) + byline.length();
                            if (from <= term) {
                                bdy_author += tmp_line.substring(from, term) + " ";
                            }
                            metastarted = true;
                            longline = ((tmp_line.contains("=")) && (!longline.equals(longkey)) ? longkey : "");
                        } else if (lower_line.startsWith("c.")) {
                            // the author line for bloomberg is a multiline starting with c.2011 Bloomberg News
                            // then containing the author info on the next line
                            if (val_next == TB) {
                                value[--read] = CT;
                                continue;
                            }
                        } else if (lower_line.trim().startsWith("(") && lower_line.trim().endsWith(")")) {
                            // the author line may have one or more comment lines between the copyright
                            // statement, and the By AUTHORNAME line
                            if (val_next == TB) {
                                value[--read] = CT;
                                continue;
                            }
                        }
                    } else if (tmp_line.toLowerCase(Locale.ROOT).startsWith("eds") || longline.equals("bdy_source")) {
                        longkey = "bdy_source";
                        // prepend a space to subsequent line, so it gets parsed consistent with the lead line
                        tmp_line = (longline.equals(longkey) ? " " : "") + tmp_line;

                        // we have a source candidate
                        int term = tmp_line.length();
                        term = Math.min(term, (tmp_line.contains("<") ? tmp_line.indexOf("<") : term));
                        term = Math.min(term, (tmp_line.contains("=") ? tmp_line.indexOf("=") : term));
                        term = (term > 0) ? term : tmp_line.length();
                        // skip the line when the terminator comes before the text, instead of failing
                        int from = tmp_line.indexOf(" ") + 1;
                        if (from <= term) {
                            bdy_source += tmp_line.substring(from, term) + " ";
                        }
                        metastarted = true;
                        longline = (!longline.equals(longkey) ? longkey : "");
                    } else {
                        // this has fallen all the way through.  trap it as part of the subject,
                        // rather than just losing it
                        if (!metastarted) {
                            bdy_title += " , " + tmp_line;     //  not sure where else to put this but in the title
                        } else {
                            // what to do with stuff that is metadata, which falls after metadata lines started?
                            bdy_body.append(" ").append(tmp_line).append(" , ");   //  not sure where else to put this but in the body
                        }
                    }
                } else {  // we're on to the main body
                    // read until the train runs out of tracks; the byte in hand is always
                    // appended before fetching the next one, so the final byte is kept
                    while (val_next != 0) {
                        bdy_body.append((char) (val_next & 0xff));  // convert the byte to an unsigned int
                        val_next = (read < value.length) ? value[read++] : 0x00;
                    }
                }
                // we would normally break here, but just let this read out to the end
            }
            done = true; // don't let this run back through and start thrashing metadata
        }
        properties.put("body", bdy_body.toString());
        properties.put("title", bdy_title);
        properties.put("subject", bdy_heading);
        properties.put("author", bdy_author);
        properties.put("source", bdy_source);

        return (bdy_body.length() + bdy_title.length() + bdy_heading.length() + bdy_author.length()
                + bdy_source.length()) > 0;
    }

    /**
     * Reads the news feed source and the timestamp from the footer and stores
     * them under "publisher", "created" and "modified". The timestamp is
     * re-formatted as {@code yyyy-MM-dd'T'HH:mm:ss'Z'} in UTC; if it cannot be
     * parsed, the current time is used instead.
     *
     * @return true if a source or a timestamp was found
     */
    private boolean parseFooter(byte[] value, HashMap<String, String> properties) {
        String ftr_source = "";
        String ftr_datetime = "";

        int read = 0;
        boolean done = false;

        while (!done && (read < value.length)) {

            // pull apart the footer, getting the news feed source (^....\x0d\x0a)
            byte val_next = value[read++];

            while (((val_next < (byte) 0x30) || (val_next > (byte) 0x39)) && (val_next != 0)) {  // consume all non-numerics first
                ftr_source += (char) (val_next & 0xff);  // convert the byte to an unsigned int
                val_next = (read < value.length) ? value[read] : 0x00;  // attempt to read until end of stream
                read++;
                if (read > value.length) {
                    break;   // ran off the end of the footer
                }
            }

            while ((val_next != LT) && (val_next != CR) && (val_next != LF) && (val_next != 0)) {  // get as much timedate as possible
                // this is an american format, so arrives as mm-dd-yy HHiizzz
                ftr_datetime += (char) (val_next & 0xff);  // convert the byte to an unsigned int
                val_next = (read < value.length) ? value[read++] : 0x00;
            }
            if (val_next == LT) {
                // hit the delimiter, carry on
                val_next = (read < value.length) ? value[read++] : 0x00;
            }

            if (ftr_datetime.length() > 0) {
                // we want to pass this back in a more friendly format
                String format_out = "yyyy-MM-dd'T'HH:mm:ss'Z'";
                Date dateunix = new Date();
                try {
                    // standard ap format
                    String format_in = "MM-dd-yy HHmmzzz";

                    if (FORMAT == FMT_IPTC_RTR) {
                        // standard reuters format
                        format_in = "HH:mm MM-dd-yy";
                    }
                    SimpleDateFormat dfi = new SimpleDateFormat(format_in, Locale.ROOT);
                    dfi.setTimeZone(TimeZone.getTimeZone("UTC"));
                    dateunix = dfi.parse(ftr_datetime);
                } catch (ParseException ep) {
                    // failed, but this will just fall through to setting the date to now
                }
                SimpleDateFormat dfo = new SimpleDateFormat(format_out, Locale.ROOT);
                dfo.setTimeZone(TimeZone.getTimeZone("UTC"));
                ftr_datetime = dfo.format(dateunix);
            }
            while ((val_next == CR) || (val_next == LF)) {
                val_next = (read < value.length) ? value[read++] : 0x00;  // skip the new lines
                if ((val_next != CR) && (val_next != LF)) {
                    --read;
                }
            }
            done = true; // don't let this run back through and start thrashing metadata
        }

        properties.put("publisher", ftr_source);
        properties.put("created", ftr_datetime);
        properties.put("modified", ftr_datetime);

        return (ftr_source.length() + ftr_datetime.length()) > 0;
    }

    /**
     * Copies the parsed properties into the Tika metadata. The publisher is
     * taken from the detected wire format, not from the footer.
     */
    private void setMetadata(Metadata metadata, HashMap<String, String> properties) {

        // every property that gets set must be non-null, or it will cause NPE
        // in other consuming applications, like Lucene
        metadata.set(Metadata.CONTENT_TYPE, clean("text/anpa-1312"));
        metadata.set(TikaCoreProperties.TITLE, clean(properties.get("title")));
        metadata.set(TikaCoreProperties.KEYWORDS, clean(properties.get("subject")));
        metadata.set(TikaCoreProperties.CREATOR, clean(properties.get("author")));
        metadata.set(TikaCoreProperties.CREATED, clean(properties.get("created")));
        metadata.set(TikaCoreProperties.MODIFIED, clean(properties.get("modified")));
        metadata.set(TikaCoreProperties.SOURCE, clean(properties.get("source")));
        // the "publisher" property read from the footer is not used here
        metadata.set(TikaCoreProperties.PUBLISHER, clean(this.getFormatName()));
    }

    /**
     * Normalises quoting in a value: doubled back-ticks and doubled
     * apostrophes are collapsed to one, the 0x91-0x94 quote characters are
     * replaced by plain quotes, and surrounding whitespace is trimmed.
     *
     * @param value the text to clean; {@code null} is treated as empty
     * @return the cleaned text, never {@code null}
     */
    private String clean(String value) {
        if (value == null) {
            value = "";
        }

        // plain literal replacements; no regular expressions are needed here
        value = value.replace("``", "`");
        value = value.replace("''", "'");
        value = value.replace(SL, '\'');
        value = value.replace(SR, '\'');
        value = value.replace(DL, '"');
        value = value.replace(DR, '"');

        return value.trim();
    }
}
// Added: tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
// URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java?rev=1723223&view=auto
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.mail;

import java.io.IOException;
import java.io.InputStream;

import org.apache.james.mime4j.MimeException;
import org.apache.james.mime4j.codec.DecodeMonitor;
import org.apache.james.mime4j.codec.DecoderUtil;
import org.apache.james.mime4j.dom.address.Address;
import org.apache.james.mime4j.dom.address.AddressList;
import org.apache.james.mime4j.dom.address.Mailbox;
import org.apache.james.mime4j.dom.address.MailboxList;
import org.apache.james.mime4j.dom.field.AddressListField;
import org.apache.james.mime4j.dom.field.DateTimeField;
import org.apache.james.mime4j.dom.field.MailboxListField;
import org.apache.james.mime4j.dom.field.ParsedField;
import org.apache.james.mime4j.dom.field.UnstructuredField;
import org.apache.james.mime4j.field.LenientFieldParser;
import org.apache.james.mime4j.parser.ContentHandler;
import org.apache.james.mime4j.stream.BodyDescriptor;
import org.apache.james.mime4j.stream.Field;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.SAXException;

/**
 * Bridge between mime4j's content handler and the generic Sax content handler
 * used by Tika. See
 * http://james.apache.org/mime4j/apidocs/org/apache/james/mime4j/parser/ContentHandler.html
 */
class MailContentHandler implements ContentHandler {

    /** When true, runtime failures while reading a header field are rethrown instead of ignored. */
    private boolean strictParsing = false;

    private XHTMLContentHandler handler;
    private Metadata metadata;
    private EmbeddedDocumentExtractor extractor;

    /** Set between startMultipart and endMultipart; header fields seen meanwhile are skipped. */
    private boolean inPart = false;

    /**
     * @param xhtml         handler receiving the XHTML events for the message
     * @param metadata      metadata of the whole message, filled from its header fields
     * @param context       parse context, consulted for an EmbeddedDocumentExtractor,
     *                      a parser and a TikaConfig
     * @param strictParsing whether runtime failures in header fields are rethrown
     */
    MailContentHandler(XHTMLContentHandler xhtml, Metadata metadata, ParseContext context, boolean strictParsing) {
        this.handler = xhtml;
        this.metadata = metadata;
        this.strictParsing = strictParsing;

        // Fetch / Build an EmbeddedDocumentExtractor with which
        //  to handle/process the parts/attachments

        // Was an EmbeddedDocumentExtractor explicitly supplied?
        this.extractor = context.get(EmbeddedDocumentExtractor.class);

        // If there's no EmbeddedDocumentExtractor, then try using a normal parser
        // This will ensure that the contents are made available to the user, so
        //  they see the text, but without fine-grained control/extraction
        // (This also maintains backward compatibility with older versions!)
        if (this.extractor == null) {
            // If the user gave a parser, use that, if not the default
            Parser parser = context.get(AutoDetectParser.class);
            if (parser == null) {
                parser = context.get(Parser.class);
            }
            if (parser == null) {
                TikaConfig tikaConfig = context.get(TikaConfig.class);
                if (tikaConfig == null) {
                    tikaConfig = TikaConfig.getDefaultConfig();
                }
                parser = new AutoDetectParser(tikaConfig.getParser());
            }
            ParseContext ctx = new ParseContext();
            ctx.set(Parser.class, parser);
            extractor = new ParsingEmbeddedDocumentExtractor(ctx);
        }
    }

    /**
     * Hands a body part to the embedded document extractor, described by a
     * fresh metadata object carrying the part's mime type and charset.
     */
    public void body(BodyDescriptor body, InputStream is) throws MimeException,
            IOException {
        // use a different metadata object
        // in order to specify the mime type of the
        // sub part without damaging the main metadata

        Metadata submd = new Metadata();
        submd.set(Metadata.CONTENT_TYPE, body.getMimeType());
        submd.set(Metadata.CONTENT_ENCODING, body.getCharset());

        try {
            if (extractor.shouldParseEmbedded(submd)) {
                extractor.parseEmbedded(is, handler, submd, false);
            }
        } catch (SAXException e) {
            throw new MimeException(e);
        }
    }

    /** Closes the "p" and "div" elements opened by {@link #startBodyPart()}. */
    public void endBodyPart() throws MimeException {
        try {
            handler.endElement("p");
            handler.endElement("div");
        } catch (SAXException e) {
            throw new MimeException(e);
        }
    }

    public void endHeader() throws MimeException {
    }

    public void startMessage() throws MimeException {
        try {
            handler.startDocument();
        } catch (SAXException e) {
            throw new MimeException(e);
        }
    }

    public void endMessage() throws MimeException {
        try {
            handler.endDocument();
        } catch (SAXException e) {
            throw new MimeException(e);
        }
    }

    public void endMultipart() throws MimeException {
        inPart = false;
    }

    public void epilogue(InputStream is) throws MimeException, IOException {
    }

    /**
     * Header for the whole message or its parts. From, Subject, To, CC, BCC
     * and Date are copied into the message metadata; fields seen while inside
     * a multipart are ignored. Runtime failures are rethrown only in strict
     * parsing mode.
     *
     * @see http://james.apache.org/mime4j/apidocs/org/apache/james/mime4j/parser/
     *      Field.html
     */
    public void field(Field field) throws MimeException {
        // inPart indicates whether these metadata correspond to the
        // whole message or its parts
        if (inPart) {
            return;
        }

        try {
            String fieldname = field.getName();
            ParsedField parsedField = LenientFieldParser.getParser().parse(
                    field, DecodeMonitor.SILENT);
            if (fieldname.equalsIgnoreCase("From")) {
                MailboxListField fromField = (MailboxListField) parsedField;
                MailboxList mailboxList = fromField.getMailboxList();
                if (fromField.isValidField() && mailboxList != null) {
                    for (Address address : mailboxList) {
                        String from = getDisplayString(address);
                        metadata.add(Metadata.MESSAGE_FROM, from);
                        metadata.add(TikaCoreProperties.CREATOR, from);
                    }
                } else {
                    // unparseable sender: fall back to the raw text without the angle brackets
                    String from = stripOutFieldPrefix(field, "From:");
                    if (from.startsWith("<")) {
                        from = from.substring(1);
                    }
                    if (from.endsWith(">")) {
                        from = from.substring(0, from.length() - 1);
                    }
                    metadata.add(Metadata.MESSAGE_FROM, from);
                    metadata.add(TikaCoreProperties.CREATOR, from);
                }
            } else if (fieldname.equalsIgnoreCase("Subject")) {
                metadata.add(TikaCoreProperties.TRANSITION_SUBJECT_TO_DC_TITLE,
                        ((UnstructuredField) parsedField).getValue());
            } else if (fieldname.equalsIgnoreCase("To")) {
                processAddressList(parsedField, "To:", Metadata.MESSAGE_TO);
            } else if (fieldname.equalsIgnoreCase("CC")) {
                processAddressList(parsedField, "Cc:", Metadata.MESSAGE_CC);
            } else if (fieldname.equalsIgnoreCase("BCC")) {
                processAddressList(parsedField, "Bcc:", Metadata.MESSAGE_BCC);
            } else if (fieldname.equalsIgnoreCase("Date")) {
                DateTimeField dateField = (DateTimeField) parsedField;
                metadata.set(TikaCoreProperties.CREATED, dateField.getDate());
            }
        } catch (RuntimeException me) {
            if (strictParsing) {
                throw me;
            }
        }
    }

    /**
     * Adds every address of an address-list field to {@code metadataField}.
     * When the field is invalid or yields no address list, the raw field text
     * after the {@code addressListType} prefix is split on commas instead.
     */
    private void processAddressList(ParsedField field, String addressListType,
                                    String metadataField) throws MimeException {
        AddressListField toField = (AddressListField) field;
        AddressList addressList = toField.getAddressList();
        // same guard as for the From field: a missing list falls back to the raw text
        if (toField.isValidField() && addressList != null) {
            for (Address address : addressList) {
                metadata.add(metadataField, getDisplayString(address));
            }
        } else {
            String to = stripOutFieldPrefix(field,
                    addressListType);
            for (String eachTo : to.split(",")) {
                metadata.add(metadataField, eachTo.trim());
            }
        }
    }

    /**
     * @return "Name &lt;address&gt;" for a mailbox with a name, the bare
     *         address for a mailbox without one, and {@code toString()} for
     *         any other kind of address
     */
    private String getDisplayString(Address address) {
        if (address instanceof Mailbox) {
            Mailbox mailbox = (Mailbox) address;
            String name = mailbox.getName();
            if (name != null && name.length() > 0) {
                name = DecoderUtil.decodeEncodedWords(name, DecodeMonitor.SILENT);
                return name + " <" + mailbox.getAddress() + ">";
            } else {
                return mailbox.getAddress();
            }
        } else {
            return address.toString();
        }
    }

    public void preamble(InputStream is) throws MimeException, IOException {
    }

    public void raw(InputStream is) throws MimeException, IOException {
    }

    /** Opens a "div" with class "email-entry" and a "p" for the body part. */
    public void startBodyPart() throws MimeException {
        try {
            handler.startElement("div", "class", "email-entry");
            handler.startElement("p");
        } catch (SAXException e) {
            throw new MimeException(e);
        }
    }

    public void startHeader() throws MimeException {
        // nothing to do
    }

    public void startMultipart(BodyDescriptor descr) throws MimeException {
        inPart = true;
    }

    /**
     * Returns the raw field text with the first {@code fieldname.length()}
     * characters and any spaces directly after them removed. A raw text that
     * is shorter than the prefix, or holds nothing but spaces after it,
     * yields an empty string.
     */
    private String stripOutFieldPrefix(Field field, String fieldname) {
        String temp = field.getRaw().toString();
        int loc = Math.min(fieldname.length(), temp.length());
        while (loc < temp.length() && temp.charAt(loc) == ' ') {
            loc++;
        }
        return temp.substring(loc);
    }

}
// Added: tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java
// URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java?rev=1723223&view=auto
tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,95 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.mail; + +import java.io.IOException; +import java.io.InputStream; +import java.util.Collections; +import java.util.Set; + +import org.apache.james.mime4j.MimeException; +import org.apache.james.mime4j.parser.MimeStreamParser; +import org.apache.james.mime4j.stream.MimeConfig; +import org.apache.tika.exception.TikaException; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AbstractParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +/** + * Uses apache-mime4j to parse emails. Each part is treated with the + * corresponding parser and displayed within elements. 
+ * <p/> + * A {@link MimeEntityConfig} object can be passed in the parsing context + * to better control the parsing process. + * + * @author [email protected] + */ +public class RFC822Parser extends AbstractParser { + /** + * Serial version UID + */ + private static final long serialVersionUID = -5504243905998074168L; + + private static final Set<MediaType> SUPPORTED_TYPES = Collections + .singleton(MediaType.parse("message/rfc822")); + + public Set<MediaType> getSupportedTypes(ParseContext context) { + return SUPPORTED_TYPES; + } + + public void parse(InputStream stream, ContentHandler handler, + Metadata metadata, ParseContext context) throws IOException, + SAXException, TikaException { + // Get the mime4j configuration, or use a default one + MimeConfig config = new MimeConfig(); + config.setMaxLineLen(100000); + config.setMaxHeaderLen(100000); // max length of any individual header + config = context.get(MimeConfig.class, config); + + MimeStreamParser parser = new MimeStreamParser(config); + XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); + + MailContentHandler mch = new MailContentHandler( + xhtml, metadata, context, config.isStrictParsing()); + parser.setContentHandler(mch); + parser.setContentDecoding(true); + + TikaInputStream tstream = TikaInputStream.get(stream); + try { + parser.parse(tstream); + } catch (IOException e) { + tstream.throwIfCauseOf(e); + throw new TikaException("Failed to parse an email message", e); + } catch (MimeException e) { + // Unwrap the exception in case it was not thrown by mime4j + Throwable cause = e.getCause(); + if (cause instanceof TikaException) { + throw (TikaException) cause; + } else if (cause instanceof SAXException) { + throw (SAXException) cause; + } else { + throw new TikaException("Failed to parse an email message", e); + } + } + } + +} Added: tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,209 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.mbox; + +import java.io.BufferedReader; +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.text.ParseException; +import java.text.SimpleDateFormat; +import java.util.Collections; +import java.util.Date; +import java.util.HashMap; +import java.util.LinkedList; +import java.util.Locale; +import java.util.Map; +import java.util.Queue; +import java.util.Set; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.extractor.EmbeddedDocumentExtractor; +import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AbstractParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +/** + * Mbox (mailbox) parser. This version extracts each mail from Mbox and uses the + * DelegatingParser to process each mail. 
+ */ +public class MboxParser extends AbstractParser { + + public static final String MBOX_MIME_TYPE = "application/mbox"; + public static final String MBOX_RECORD_DIVIDER = "From "; + public static final int MAIL_MAX_SIZE = 50000000; + /** + * Serial version UID + */ + private static final long serialVersionUID = -1762689436731160661L; + private static final Set<MediaType> SUPPORTED_TYPES = Collections.singleton(MediaType.application("mbox")); + private static final Pattern EMAIL_HEADER_PATTERN = Pattern.compile("([^ ]+):[ \t]*(.*)"); + private static final Pattern EMAIL_ADDRESS_PATTERN = Pattern.compile("<(.*@.*)>"); + + private static final String EMAIL_HEADER_METADATA_PREFIX = "MboxParser-"; + private static final String EMAIL_FROMLINE_METADATA = EMAIL_HEADER_METADATA_PREFIX + "from"; + private final Map<Integer, Metadata> trackingMetadata = new HashMap<Integer, Metadata>(); + private boolean tracking = false; + + public static Date parseDate(String headerContent) throws ParseException { + SimpleDateFormat dateFormat = new SimpleDateFormat("EEE, d MMM yyyy HH:mm:ss Z", Locale.US); + return dateFormat.parse(headerContent); + } + + public Set<MediaType> getSupportedTypes(ParseContext context) { + return SUPPORTED_TYPES; + } + + public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) + throws IOException, TikaException, SAXException { + + EmbeddedDocumentExtractor extractor = context.get(EmbeddedDocumentExtractor.class, + new ParsingEmbeddedDocumentExtractor(context)); + + String charsetName = "windows-1252"; + + metadata.set(Metadata.CONTENT_TYPE, MBOX_MIME_TYPE); + metadata.set(Metadata.CONTENT_ENCODING, charsetName); + + XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); + xhtml.startDocument(); + + InputStreamReader isr = new InputStreamReader(stream, charsetName); + try (BufferedReader reader = new BufferedReader(isr)) { + String curLine = reader.readLine(); + int mailItem = 0; + do { + if 
(curLine.startsWith(MBOX_RECORD_DIVIDER)) { + Metadata mailMetadata = new Metadata(); + Queue<String> multiline = new LinkedList<String>(); + mailMetadata.add(EMAIL_FROMLINE_METADATA, curLine.substring(MBOX_RECORD_DIVIDER.length())); + mailMetadata.set(Metadata.CONTENT_TYPE, "message/rfc822"); + curLine = reader.readLine(); + + ByteArrayOutputStream message = new ByteArrayOutputStream(100000); + do { + if (curLine.startsWith(" ") || curLine.startsWith("\t")) { + String latestLine = multiline.poll(); + latestLine += " " + curLine.trim(); + multiline.add(latestLine); + } else { + multiline.add(curLine); + } + + message.write(curLine.getBytes(charsetName)); + message.write(0x0A); + curLine = reader.readLine(); + } + while (curLine != null && !curLine.startsWith(MBOX_RECORD_DIVIDER) && message.size() < MAIL_MAX_SIZE); + + for (String item : multiline) { + saveHeaderInMetadata(mailMetadata, item); + } + + ByteArrayInputStream messageStream = new ByteArrayInputStream(message.toByteArray()); + message = null; + + if (extractor.shouldParseEmbedded(mailMetadata)) { + extractor.parseEmbedded(messageStream, xhtml, mailMetadata, true); + } + + if (tracking) { + getTrackingMetadata().put(mailItem++, mailMetadata); + } + } else { + curLine = reader.readLine(); + } + + } while (curLine != null && !Thread.currentThread().isInterrupted()); + } + + xhtml.endDocument(); + } + + public boolean isTracking() { + return tracking; + } + + public void setTracking(boolean tracking) { + this.tracking = tracking; + } + + public Map<Integer, Metadata> getTrackingMetadata() { + return trackingMetadata; + } + + private void saveHeaderInMetadata(Metadata metadata, String curLine) { + Matcher headerMatcher = EMAIL_HEADER_PATTERN.matcher(curLine); + if (!headerMatcher.matches()) { + return; // ignore malformed header lines + } + + String headerTag = headerMatcher.group(1).toLowerCase(Locale.ROOT); + String headerContent = headerMatcher.group(2); + + if (headerTag.equalsIgnoreCase("From")) { + 
metadata.set(TikaCoreProperties.CREATOR, headerContent); + } else if (headerTag.equalsIgnoreCase("To") || headerTag.equalsIgnoreCase("Cc") + || headerTag.equalsIgnoreCase("Bcc")) { + Matcher address = EMAIL_ADDRESS_PATTERN.matcher(headerContent); + if (address.find()) { + metadata.add(Metadata.MESSAGE_RECIPIENT_ADDRESS, address.group(1)); + } else if (headerContent.indexOf('@') > -1) { + metadata.add(Metadata.MESSAGE_RECIPIENT_ADDRESS, headerContent); + } + + String property = Metadata.MESSAGE_TO; + if (headerTag.equalsIgnoreCase("Cc")) { + property = Metadata.MESSAGE_CC; + } else if (headerTag.equalsIgnoreCase("Bcc")) { + property = Metadata.MESSAGE_BCC; + } + metadata.add(property, headerContent); + } else if (headerTag.equalsIgnoreCase("Subject")) { + metadata.add(Metadata.SUBJECT, headerContent); + } else if (headerTag.equalsIgnoreCase("Date")) { + try { + Date date = parseDate(headerContent); + metadata.set(TikaCoreProperties.CREATED, date); + } catch (ParseException e) { + // ignoring date because format was not understood + } + } else if (headerTag.equalsIgnoreCase("Message-Id")) { + metadata.set(TikaCoreProperties.IDENTIFIER, headerContent); + } else if (headerTag.equalsIgnoreCase("In-Reply-To")) { + metadata.set(TikaCoreProperties.RELATION, headerContent); + } else if (headerTag.equalsIgnoreCase("Content-Type")) { + // TODO - key off content-type in headers to + // set mapping to use for content and convert if necessary. 
+ + metadata.add(Metadata.CONTENT_TYPE, headerContent); + metadata.set(TikaCoreProperties.FORMAT, headerContent); + } else { + metadata.add(EMAIL_HEADER_METADATA_PREFIX + headerTag, headerContent); + } + } +} Added: tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,203 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.mbox; + +import static java.lang.String.valueOf; +import static java.nio.charset.StandardCharsets.UTF_8; +import static java.util.Collections.singleton; + +import java.io.ByteArrayInputStream; +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.util.Set; + +import com.pff.PSTAttachment; +import com.pff.PSTFile; +import com.pff.PSTFolder; +import com.pff.PSTMessage; +import org.apache.tika.exception.TikaException; +import org.apache.tika.extractor.EmbeddedDocumentExtractor; +import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor; +import org.apache.tika.io.TemporaryResources; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AbstractParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.AttributesImpl; + +/** + * Parser for MS Outlook PST email storage files + */ +public class OutlookPSTParser extends AbstractParser { + + private static final long serialVersionUID = 620998217748364063L; + + public static final MediaType MS_OUTLOOK_PST_MIMETYPE = MediaType.application("vnd.ms-outlook-pst"); + private static final Set<MediaType> SUPPORTED_TYPES = singleton(MS_OUTLOOK_PST_MIMETYPE); + + private static AttributesImpl createAttribute(String attName, String attValue) { + AttributesImpl attributes = new AttributesImpl(); + attributes.addAttribute("", attName, attName, "CDATA", attValue); + return attributes; + } + + public Set<MediaType> getSupportedTypes(ParseContext context) { + return SUPPORTED_TYPES; + } + + public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) + throws IOException, SAXException, TikaException { + + // Use the 
delegate parser to parse the contained document + EmbeddedDocumentExtractor embeddedExtractor = context.get(EmbeddedDocumentExtractor.class, + new ParsingEmbeddedDocumentExtractor(context)); + + metadata.set(Metadata.CONTENT_TYPE, MS_OUTLOOK_PST_MIMETYPE.toString()); + + XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); + xhtml.startDocument(); + + TikaInputStream in = TikaInputStream.get(stream); + PSTFile pstFile = null; + try { + pstFile = new PSTFile(in.getFile().getPath()); + metadata.set(Metadata.CONTENT_LENGTH, valueOf(pstFile.getFileHandle().length())); + boolean isValid = pstFile.getFileHandle().getFD().valid(); + metadata.set("isValid", valueOf(isValid)); + if (isValid) { + parseFolder(xhtml, pstFile.getRootFolder(), embeddedExtractor); + } + } catch (Exception e) { + throw new TikaException(e.getMessage(), e); + } finally { + if (pstFile != null && pstFile.getFileHandle() != null) { + try { + pstFile.getFileHandle().close(); + } catch (IOException e) { + //swallow closing exception + } + } + } + + xhtml.endDocument(); + } + + private void parseFolder(XHTMLContentHandler handler, PSTFolder pstFolder, EmbeddedDocumentExtractor embeddedExtractor) + throws Exception { + if (pstFolder.getContentCount() > 0) { + PSTMessage pstMail = (PSTMessage) pstFolder.getNextChild(); + while (pstMail != null) { + AttributesImpl attributes = new AttributesImpl(); + attributes.addAttribute("", "class", "class", "CDATA", "embedded"); + attributes.addAttribute("", "id", "id", "CDATA", pstMail.getInternetMessageId()); + handler.startElement("div", attributes); + handler.element("h1", pstMail.getSubject()); + + parserMailItem(handler, pstMail, embeddedExtractor); + parseMailAttachments(handler, pstMail, embeddedExtractor); + + handler.endElement("div"); + + pstMail = (PSTMessage) pstFolder.getNextChild(); + } + } + + if (pstFolder.hasSubfolders()) { + for (PSTFolder pstSubFolder : pstFolder.getSubFolders()) { + handler.startElement("div", 
createAttribute("class", "email-folder")); + handler.element("h1", pstSubFolder.getDisplayName()); + parseFolder(handler, pstSubFolder, embeddedExtractor); + handler.endElement("div"); + } + } + } + + private void parserMailItem(XHTMLContentHandler handler, PSTMessage pstMail, EmbeddedDocumentExtractor embeddedExtractor) throws SAXException, IOException { + Metadata mailMetadata = new Metadata(); + mailMetadata.set(Metadata.RESOURCE_NAME_KEY, pstMail.getInternetMessageId()); + mailMetadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, pstMail.getInternetMessageId()); + mailMetadata.set(TikaCoreProperties.IDENTIFIER, pstMail.getInternetMessageId()); + mailMetadata.set(TikaCoreProperties.TITLE, pstMail.getSubject()); + mailMetadata.set(Metadata.MESSAGE_FROM, pstMail.getSenderName()); + mailMetadata.set(TikaCoreProperties.CREATOR, pstMail.getSenderName()); + mailMetadata.set(TikaCoreProperties.CREATED, pstMail.getCreationTime()); + mailMetadata.set(TikaCoreProperties.MODIFIED, pstMail.getLastModificationTime()); + mailMetadata.set(TikaCoreProperties.COMMENTS, pstMail.getComment()); + mailMetadata.set("descriptorNodeId", valueOf(pstMail.getDescriptorNodeId())); + mailMetadata.set("senderEmailAddress", pstMail.getSenderEmailAddress()); + mailMetadata.set("recipients", pstMail.getRecipientsString()); + mailMetadata.set("displayTo", pstMail.getDisplayTo()); + mailMetadata.set("displayCC", pstMail.getDisplayCC()); + mailMetadata.set("displayBCC", pstMail.getDisplayBCC()); + mailMetadata.set("importance", valueOf(pstMail.getImportance())); + mailMetadata.set("priority", valueOf(pstMail.getPriority())); + mailMetadata.set("flagged", valueOf(pstMail.isFlagged())); + + byte[] mailContent = pstMail.getBody().getBytes(UTF_8); + embeddedExtractor.parseEmbedded(new ByteArrayInputStream(mailContent), handler, mailMetadata, true); + } + + private void parseMailAttachments(XHTMLContentHandler xhtml, PSTMessage email, EmbeddedDocumentExtractor embeddedExtractor) + throws TikaException { + 
int numberOfAttachments = email.getNumberOfAttachments(); + for (int i = 0; i < numberOfAttachments; i++) { + File tempFile = null; + try { + PSTAttachment attach = email.getAttachment(i); + + // Get the filename; both long and short filenames can be used for attachments + String filename = attach.getLongFilename(); + if (filename.isEmpty()) { + filename = attach.getFilename(); + } + + xhtml.element("p", filename); + + Metadata attachMeta = new Metadata(); + attachMeta.set(Metadata.RESOURCE_NAME_KEY, filename); + attachMeta.set(Metadata.EMBEDDED_RELATIONSHIP_ID, filename); + AttributesImpl attributes = new AttributesImpl(); + attributes.addAttribute("", "class", "class", "CDATA", "embedded"); + attributes.addAttribute("", "id", "id", "CDATA", filename); + xhtml.startElement("div", attributes); + if (embeddedExtractor.shouldParseEmbedded(attachMeta)) { + TemporaryResources tmp = new TemporaryResources(); + try { + TikaInputStream tis = TikaInputStream.get(attach.getFileInputStream(), tmp); + embeddedExtractor.parseEmbedded(tis, xhtml, attachMeta, true); + } finally { + tmp.dispose(); + } + } + xhtml.endElement("div"); + + } catch (Exception e) { + throw new TikaException("Unable to unpack document stream", e); + } finally { + if (tempFile != null) + tempFile.delete(); + } + } + } + +} Added: tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector (added) +++ tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector Wed Jan 6 
03:50:50 2016 @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +org.apache.tika.parser.html.HtmlEncodingDetector Added: tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser (added) +++ tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser Wed Jan 6 03:50:50 2016 @@ -0,0 +1,22 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +org.apache.tika.parser.feed.FeedParser +org.apache.tika.parser.html.HtmlParser +org.apache.tika.parser.mail.RFC822Parser +org.apache.tika.parser.mbox.MboxParser +org.apache.tika.parser.mbox.OutlookPSTParser +org.apache.tika.parser.iptc.IptcAnpaParser Added: tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/resources/org/apache/tika/parser/ctakes/CTAKESConfig.properties URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/resources/org/apache/tika/parser/ctakes/CTAKESConfig.properties?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/resources/org/apache/tika/parser/ctakes/CTAKESConfig.properties (added) +++ tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/resources/org/apache/tika/parser/ctakes/CTAKESConfig.properties Wed Jan 6 03:50:50 2016 @@ -0,0 +1,22 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +aeDescriptorPath=/ctakes-clinical-pipeline/desc/analysis_engine/AggregatePlaintextUMLSProcessor.xml +text=true +annotationProps=BEGIN,END,ONTOLOGY_CONCEPT_ARR +separatorChar=: +metadata=Study Title,Study Description +UMLSUser= +UMLSPass= Added: tika/branches/2.x/tika-parser-modules/tika-web-module/src/test/java/org/apache/tika/parser/feed/FeedParserTest.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-web-module/src/test/java/org/apache/tika/parser/feed/FeedParserTest.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-web-module/src/test/java/org/apache/tika/parser/feed/FeedParserTest.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-web-module/src/test/java/org/apache/tika/parser/feed/FeedParserTest.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,75 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.feed;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;

import java.io.InputStream;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.BodyContentHandler;
import org.junit.Test;
import org.xml.sax.ContentHandler;

public class FeedParserTest {

    /**
     * Runs FeedParser over the given classpath resource, checks that the
     * extracted body text is not null and returns the collected metadata.
     */
    private static Metadata parseFeed(String resourcePath) throws Exception {
        try (InputStream feed = FeedParserTest.class.getResourceAsStream(resourcePath)) {
            Metadata feedMetadata = new Metadata();
            ContentHandler bodyHandler = new BodyContentHandler();

            new FeedParser().parse(feed, bodyHandler, feedMetadata, new ParseContext());

            String text = bodyHandler.toString();
            assertFalse(text == null);
            return feedMetadata;
        }
    }

    /** The RSS sample exposes its channel description and title. */
    @Test
    public void testRSSParser() throws Exception {
        Metadata rss = parseFeed("/test-documents/rsstest.rss");

        assertEquals("Sample RSS File for Junit test",
                rss.get(TikaCoreProperties.DESCRIPTION));
        assertEquals("TestChannel", rss.get(TikaCoreProperties.TITLE));

        // TODO find a way of testing the paragraphs and anchors
    }

    /** The Atom sample exposes its feed description and title. */
    @Test
    public void testAtomParser() throws Exception {
        Metadata atom = parseFeed("/test-documents/testATOM.atom");

        assertEquals("Sample Atom File for Junit test",
                atom.get(TikaCoreProperties.DESCRIPTION));
        assertEquals("Test Atom Feed", atom.get(TikaCoreProperties.TITLE));

        // TODO Check some more
    }

}
