svn commit: r1331794 - in /tika/trunk: tika-core/src/main/resources/org/apache/tika/mime/ tika-parsers/src/main/java/org/apache/tika/parser/iptc/ tika-parsers/src/main/resources/META-INF/services/

nick Sat, 28 Apr 2012 09:54:00 -0700

Author: nick
Date: Sat Apr 28 16:53:35 2012
New Revision: 1331794

URL: http://svn.apache.org/viewvc?rev=1331794&view=rev
Log:
TIKA-858 Patch from Craig Stires to add support for parsing IPTC ANPA News Wire 
Feeds


Added:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iptc/
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iptc/IptcAnpaParser.java
Modified:
    
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
    
tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser

Modified: 
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=1331794&r1=1331793&r2=1331794&view=diff
==============================================================================
--- 
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml 
(original)
+++ 
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml 
Sat Apr 28 16:53:35 2012
@@ -4137,6 +4137,14 @@
   <mime-type type="text/vnd.in3d.spot">
     <glob pattern="*.spot"/>
   </mime-type>
+  <mime-type type="text/vnd.iptc.anpa">
+    <acronym>ANPA</acronym>
+    <_comment>American Newspaper Publishers Association Wire Feeds</_comment>
+    <glob pattern="*.anpa"/>
+    <magic priority="50">
+      <match value="\x16\x16\x01" type="string" offset="0"/>
+    </magic>
+  </mime-type>
   <mime-type type="text/vnd.iptc.newsml"/>
   <mime-type type="text/vnd.iptc.nitf"/>
   <mime-type type="text/vnd.latex-z"/>

Added: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iptc/IptcAnpaParser.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iptc/IptcAnpaParser.java?rev=1331794&view=auto
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iptc/IptcAnpaParser.java
 (added)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iptc/IptcAnpaParser.java
 Sat Apr 28 16:53:35 2012
@@ -0,0 +1,821 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.iptc;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+import java.util.HashMap;
+
+import java.util.Date;
+import java.util.TimeZone;
+import java.text.DateFormat;
+import java.text.SimpleDateFormat;
+import java.text.ParseException;
+import java.nio.charset.Charset;
+import java.io.UnsupportedEncodingException;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.DublinCore;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Parser for IPTC ANPA News Wire Feeds
+ */
+public class IptcAnpaParser implements Parser {
+
+    private static final MediaType TYPE =
+        MediaType.text("vnd.iptc.anpa");
+
+    private static final Set<MediaType> SUPPORTED_TYPES =
+        Collections.singleton(TYPE);
+
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    public void parse(
+           InputStream stream, ContentHandler handler,
+           Metadata metadata, ParseContext context)
+           throws IOException, SAXException, TikaException {
+
+        HashMap<String,String> properties = this.loadProperties(stream);
+        this.setMetadata(metadata, properties);
+
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+        // TODO: put body content here
+        xhtml.startElement("p");
+        String body = clean(properties.get("body"));
+        if (body != null)
+           xhtml.characters(body);
+        xhtml.endElement("p");
+        xhtml.endDocument();
+    }
+
+    /**
+     * @deprecated This method will be removed in Apache Tika 1.0.
+     */
+    public void parse(
+            InputStream stream, ContentHandler handler, Metadata metadata)
+            throws IOException, SAXException, TikaException {
+        parse(stream, handler, metadata, new ParseContext());
+    }
+
+
+   private int FMT_ANPA_1312    = 0x00;   // "NAA 89-3 (ANPA 1312)"
+   private int FMT_ANPA_UPI     = 0x01;   // "United Press International ANPA 1312 variant"
+   private int FMT_ANPA_UPI_DL  = 0x02;   // "United Press International Down-Load Message"
+   private int FMT_IPTC_7901    = 0x03;   // "IPTC7901 Recommended Message Format"
+   private int FMT_IPTC_PHOTO   = 0x04;   // "IPTC-NAA Digital Newsphoto Parameter Record"
+   private int FMT_IPTC_CHAR    = 0x05;   // "IPTC Unstructured Character Oriented File Format (UCOFF)"
+   private int FMT_NITF         = 0x06;   // "News Industry Text Format (NITF)"
+   private int FMT_NITF_TT      = 0x07;   // "Tidningarnas Telegrambyra NITF version (TTNITF DTD)"
+   private int FMT_NITF_RB      = 0x08;   // "Ritzaus Bureau NITF version (RBNITF DTD)"
+   private int FMT_IPTC_AP      = 0x09;   // "Associated Press news wire format"
+   private int FMT_IPTC_BLM     = 0x0A;   // "Bloomberg News news wire format"
+   private int FMT_IPTC_NYT     = 0x0B;   // "New York Times news wire format"
+   private int FMT_IPTC_RTR     = 0x0C;   // "Reuters news wire format"
+
+   private int FORMAT = FMT_ANPA_1312;    // assume the default format to be ANPA-1312
+
+   private final static char SOH = 0x01;    // start of header (ctrl-a)
+   private final static char STX = 0x02;    // start of text (ctrl-b)
+   private final static char ETX = 0x03;    // end of text (ctrl-c)
+   private final static char EOT = 0x04;    // end of transmission (ctrl-d)
+   private final static char SYN = 0x16;    // synchronous idle (ctrl-v)
+
+   private final static char BS = 0x08;    // the backspace character (used for diacriticals)
+   private final static char TB = 0x09;    // the tab character
+   private final static char LF = 0x0A;    // line feed
+   private final static char FF = 0x0C;    // form feed
+   private final static char CR = 0x0D;    // carriage return
+   private final static char XQ = 0x11;    // device control (ctrl-q)
+   private final static char XS = 0x13;    // device control (ctrl-s)
+   private final static char FS = 0x1F;    // a field delimiter
+
+   private final static char HY = 0x2D;    // hyphen
+   private final static char SP = 0x20;    // the blank space
+   private final static char LT = 0x3C;    // less than
+   private final static char EQ = 0x3D;    // equals sign
+   private final static char CT = 0x5E;    // caret
+
+   private final static char SL = 0x91;    // single-quote left
+   private final static char SR = 0x92;    // single-quote right
+   private final static char DL = 0x93;    // double-quote left
+   private final static char DR = 0x94;    // double-quote right
+
+
+   /**
+    * scan the news message and store the metadata and data into a map
+    */
+   private HashMap<String,String> loadProperties(InputStream is) {
+      
+      HashMap<String,String> properties = new HashMap<String,String>();
+
+      FORMAT = this.scanFormat(is);
+
+      byte[] residual = this.getSection(is,"residual");
+
+      byte[] header = this.getSection(is,"header");
+      parseHeader(header, properties);
+
+      byte[] body = this.getSection(is,"body");
+      parseBody(body, properties);
+
+      byte[] footer = this.getSection(is,"footer");
+      parseFooter(footer, properties);
+       
+      return (properties);
+   }
+
+
+   private int scanFormat(InputStream is) {
+      int format    = this.FORMAT;
+      int  maxsize  = 524288;     //  512K
+
+      byte[] buf = new byte[maxsize];
+      try {
+         if (is.markSupported()) {
+            is.mark(maxsize);
+         }
+         int msgsize = is.read(buf);                // read in as much of the message as one read call returns (at most maxsize bytes)
+
+         String message = (new String(buf)).toLowerCase();
+         // these are not if-then-else, because we want to go from most common
+         // and fall through to least.  this is imperfect, as these tags could
+         // show up in other agency stories, but i can't find a spec or any
+         // explicit codes to identify the wire source in the message itself
+
+         if (message.contains("ap-wf")) {
+            format = this.FMT_IPTC_AP;
+         }
+         if (message.contains("reuters")) {
+            format = this.FMT_IPTC_RTR;
+         }
+         if (message.contains("new york times")) {
+            format = this.FMT_IPTC_NYT;
+         }
+         if (message.contains("bloomberg news")) {
+            format = this.FMT_IPTC_BLM;
+         }
+      }
+      catch (IOException eio) {
+         // we are in an unstable state
+      }
+
+      try {
+         if (is.markSupported()) {
+            is.reset();
+         }
+      }
+      catch (IOException eio) {
+         // we are in an unstable state
+      }
+      return (format);
+   }
+
+
+   private void setFormat(int format) {
+      this.FORMAT = format;
+   }
+
+
+   private String getFormatName() {
+      
+      String name = "";
+      
+      if (FORMAT == this.FMT_IPTC_AP) {
+         name = "Associated Press";
+      }
+      
+      else if(FORMAT == this.FMT_IPTC_BLM) {
+         name = "Bloomberg";
+      }
+
+      else if(FORMAT == this.FMT_IPTC_NYT) {
+         name = "New York Times";
+      }
+
+      else if(FORMAT == this.FMT_IPTC_RTR) {
+         name = "Reuters";
+      }
+
+      return (name);
+   }
+
+
+   private byte[] getSection(InputStream is, String name) {
+
+      byte[] value = new byte[0];
+
+      if (name.equals("residual")) {
+         // the header shouldn't be more than 1k, but just being generous here
+         int  maxsize  = 8192;     //  8K
+         byte bstart   = SYN;     // check for SYN [0x16 : ctrl-v] (may have leftover residue from preceding message)
+         byte bfinish  = SOH;     // check for SOH [0x01 : ctrl-a] (typically follows a pair of SYN [0x16 : ctrl-v])
+         value = getSection(is, maxsize, bstart, bfinish, true);
+      }
+
+      else if(name.equals("header")) {
+         // the header shouldn't be more than 1k, but just being generous here
+         int  maxsize  = 8192;     //  8K
+         byte bstart   = SOH;     // check for SOH [0x01 : ctrl-a] (typically follows a pair of SYN [0x16 : ctrl-v])
+         byte bfinish  = STX;     // check for STX [0x02 : ctrl-b] (marks end of header, beginning of message)
+         value = getSection(is, maxsize, bstart, bfinish, true);
+      }
+
+      else if (name.equals("body")) {
+         // the message shouldn't be more than 16k (?), leaving plenty of space
+         int  maxsize  = 524288;     //  512K
+         byte bstart   = STX;     // check for STX [0x02 : ctrl-b] (marks end of header, beginning of message)
+         byte bfinish  = ETX;     // check for ETX [0x03 : ctrl-c] (marks end of message, beginning of footer)
+         value = getSection(is, maxsize, bstart, bfinish, true);
+      }
+
+      else if (name.equals("footer")) {
+         // the footer shouldn't be more than 1k , leaving plenty of space
+         int maxsize   = 8192;     //  8K
+         byte bstart   = ETX;     // check for ETX [0x03 : ctrl-c] (marks end of message, beginning of footer)
+         byte bfinish  = EOT;     // check for EOT [0x04 : ctrl-d] (marks end of transmission)
+         value = getSection(is, maxsize, bstart, bfinish, true);
+      }
+
+      return (value);
+   }
+
+
+   private byte[] getSection(InputStream is, int maxsize, byte bstart, byte 
bfinish, boolean ifincomplete) {
+      byte[] value  = new byte[0];
+
+      try {
+         boolean started = false;                   // check if we have found 
the start flag
+         boolean finished = false;                  // check if we have found 
the finish flag
+         int read = 0;                              // the number of bytes we 
read
+         int start = 0;                             // the position after the 
start flag
+
+         // TODO: this only pulls back 8K of data on a read, regardless of buffer size
+         //       more nefariously, it caps at a total 8K, through all sections
+         int streammax = is.available();
+         maxsize = Math.min(maxsize, streammax);
+
+         is.mark(maxsize);
+         byte[] buf = new byte[maxsize];
+         int totsize = 0;
+         int remainder = maxsize - totsize;
+         while (remainder > 0) {
+            int msgsize = is.read(buf, maxsize-remainder, maxsize);    // read 
in at least the full data
+            if (msgsize == -1) {
+               remainder = msgsize = 0;
+            }
+            remainder -= msgsize;
+            totsize   += msgsize;
+         }
+
+         // scan through the provided input stream
+         for (read=0; read < totsize; read++) {
+            byte b = buf[read];
+
+            if (!started) {
+               started = (b == bstart);
+               start = read + 1;
+               continue;
+            }
+
+            if (finished = (b == bfinish)) {
+/*
+               is.reset();
+               long skipped = is.skip((long)read);
+               if (skipped != read) {
+                  // we are in an unstable state
+               }
+               is.mark(1);
+ */
+               break;
+            }
+
+            // load from the stream until we run out of characters, or hit the 
termination byte
+            continue;
+         }
+
+         // move the input stream back to where it was initially
+         is.reset();
+
+         if (finished) {
+            // now, we want to reset the stream to be sitting right on top of 
the finish marker
+            is.skip(read);
+            value = new byte[read-start];
+            System.arraycopy(buf, start, value, 0, read-start);
+         }
+         else {
+            if (ifincomplete && started) {
+               // the caller wants anything that was read, and we finished the 
stream or buffer
+               value = new byte[read-start];
+               System.arraycopy(buf, start, value, 0, read-start);
+            }
+         }
+      }
+      catch (IOException eio) {
+         // something invalid occurred, return an empty string
+      }
+
+      return (value);
+   }
+
+
+   private boolean parseHeader(byte[] value, HashMap<String,String> 
properties) {
+      boolean added = false;
+
+      String env_serviceid = "";
+      String env_category = "";
+      String env_urgency = "";
+      String hdr_edcode = "";
+      String hdr_subject = "";
+      String hdr_date = "";
+      String hdr_time = "";
+
+      int read = 0;
+
+      while (read < value.length) {
+
+         // pull apart the envelope, getting the service id  (....\x1f)
+         while (read < value.length) {
+            byte val_next = value[read++];
+            if (val_next != FS) {
+               env_serviceid += (char)(val_next & 0xff);  // convert the byte 
to an unsigned int
+            }
+            else {
+               break;
+            }
+         }
+
+         // pull apart the envelope, getting the category  (....\x13\x11)
+         while (read < value.length) {
+            byte val_next = value[read++];
+            if (val_next != XS) {   // the end of the envelope is marked (\x13)
+               env_category += (char)(val_next & 0xff);  // convert the byte 
to an unsigned int
+            }
+            else {
+               val_next = value[read];  // get the remaining byte (\x11)
+               if (val_next == XQ) {
+                  read++;
+               }
+               break;
+            }
+         }
+
+         // pull apart the envelope, getting the subject heading
+         while (read < value.length) {
+            boolean subject = true;
+            byte val_next = value[read++];
+            while ((subject) && (val_next != SP) && (val_next != 0x00)) {  // 
ignore the envelope subject
+               hdr_subject += (char)(val_next & 0xff);  // convert the byte to 
an unsigned int
+               val_next =  (read < value.length) ? value[read++] : 0x00;
+               while (val_next == SP) {  // consume all the spaces
+                  subject = false;
+                  val_next =  (read < value.length) ? value[read++] : 0x00;
+                  if (val_next != SP) {
+                     --read;  // otherwise we eat into the next section
+                  }
+               }
+            }
+            if (!subject) {
+               break;
+            }
+         }
+
+         // pull apart the envelope, getting the date and time
+         while (read < value.length) {
+            byte val_next = value[read++];
+            if (hdr_date.isEmpty()) {
+               while (((val_next >= (byte)0x30) && (val_next <= (byte)0x39))  
// consume all numerics and hyphens
+                  ||   (val_next == HY)) {
+                  hdr_date += (char)(val_next & 0xff);  // convert the byte to 
an unsigned int
+                  val_next =  (read < value.length) ? value[read++] : 0x00;
+               }
+            }
+            else if (val_next == SP) {
+               while (val_next == SP) {  // consume all the spaces
+                  val_next =  (read < value.length) ? value[read++] : 0x00;
+               }
+               continue;
+            }
+            else {
+               while (((val_next >= (byte)0x30) && (val_next <= (byte)0x39))  
// consume all numerics and hyphens
+                  ||   (val_next == HY)) {
+                  hdr_time += (char)(val_next & 0xff);  // convert the byte to 
an unsigned int
+                  val_next =  (read < value.length) ? value[read++] : 0x00;
+               }
+            }
+         }
+         break; // don't let this run back through and start thrashing metadata
+      }
+
+      // if we were saving any of these values, we would set the properties 
map here
+
+      added = (!env_serviceid.isEmpty() || !env_category.isEmpty() || 
!hdr_subject.isEmpty() || !hdr_date.isEmpty() || !hdr_time.isEmpty());
+
+      return (added);
+   }
+
+
+   private boolean parseBody(byte[] value, HashMap<String,String> properties) {
+      boolean added = false;
+
+      String bdy_heading = "";
+      String bdy_title = "";
+      String bdy_source = "";
+      String bdy_author = "";
+      String bdy_body = "";
+
+      int read = 0;
+      boolean done = false;
+
+      while (!done && (read < value.length)) {
+
+         // pull apart the body, getting the heading (^....\x0d\x0a)
+         while (read < value.length) {
+            byte val_next = value[read++];
+            if (val_next == CT) {      //  start of a new section , first is 
the heading
+               val_next =  (read < value.length) ? value[read++] : 0x00;
+               // AP, NYT, and Bloomberg end with < , Reuters with EOL
+               while ((val_next != LT) && (val_next != CR) && (val_next != 
LF)) {   // less than delimiter (\x3c) and not EOL
+                  bdy_heading += (char)(val_next & 0xff);  // convert the byte 
to an unsigned int
+                  val_next =  (read < value.length) ? value[read++] : 0x00;
+                  if (read > value.length) { break; }  // shouldn't ever hit 
this, but save a NPE
+               }
+               if (val_next == LT) {
+                  // hit the delimiter, carry on
+                  val_next =  (read < value.length) ? value[read++] : 0x00;
+               }
+               while (!bdy_heading.isEmpty() && ((val_next == CR) || (val_next 
== LF))) {
+                  val_next =  (read < value.length) ? value[read++] : 0x00;  
// skip the new lines
+                  if ((val_next != CR) && (val_next != LF)) {
+                     --read;
+                  }
+               }
+            }
+            else {
+               // this will only be hit on poorly-formed files
+
+               // for reuters, the heading does not start with the ^, so we 
push one back into the stream
+               if (FORMAT == this.FMT_IPTC_RTR) {
+                  if (val_next != CT) {
+                     // for any non-whitespace, we need to go back an additional step to not destroy the data
+                     if ((val_next != SP) && (val_next != TB) && (val_next != 
CR) && (val_next != LF)) {
+                        // if the very first byte is data, we have to shift the whole array, and stuff in a caret
+                        if (read == 1) {
+                           byte[] resize = new byte[value.length + 1];
+                           System.arraycopy(value, 0, resize, 1, value.length);
+                           value = resize;
+                        }
+                     }
+                     value[--read] = CT;
+                     continue;
+                  }
+               }
+            }
+            break;
+         }
+
+         // pull apart the body, getting the title (^....\x0d\x0a)
+         while (read < value.length) {
+            byte val_next = value[read++];
+            if (val_next == CT) {      //  start of a new section , second is the title
+               val_next =  (read < value.length) ? value[read++] : 0x00;
+               // AP, NYT, and Bloomberg end with < , Reuters with EOL
+               while ((val_next != LT) && (val_next != CT) && (val_next != CR) 
&& (val_next != LF)) {   // less than delimiter (\x3c), or carat (\x5e) and not 
EOL
+                  bdy_title += (char)(val_next & 0xff);  // convert the byte 
to an unsigned int
+                  val_next =  (read < value.length) ? value[read++] : 0x00;
+                  if (read > value.length) { break; }  // shouldn't ever hit 
this, but save a NPE
+               }
+
+               if (val_next == CT) {      //  start of a new section , when 
first didn't finish cleanly
+                   --read;
+               }
+
+               if (val_next == LT) {
+                  // hit the delimiter, carry on
+                  val_next =  (read < value.length) ? value[read++] : 0x00;
+               }
+
+               while (!bdy_title.isEmpty() && ((val_next == CR) || (val_next 
== LF))) {
+                  val_next =  (read < value.length) ? value[read++] : 0x00;  
// skip the new lines
+                  if ((val_next != CR) && (val_next != LF)) {
+                     --read;
+                  }
+               }
+            }
+            else {
+               // this will only be hit on poorly-formed files
+
+               // for bloomberg, the title does not start with the ^, so we 
push one back into the stream
+               if (FORMAT == this.FMT_IPTC_BLM) {
+                  if (val_next == TB) {
+                     value[--read] = CT;
+                     continue;
+                  }
+               }
+
+               // for reuters, the title does not start with the ^, so we push 
one back into the stream
+               if (FORMAT == this.FMT_IPTC_RTR) {
+                  if (val_next != CT) {
+                     // for any non-whitespace, we need to go back an additional step to not destroy the data
+                     if ((val_next != SP) && (val_next != TB) && (val_next != 
CR) && (val_next != LF)) {
+                        --read;
+                     }
+                     value[--read] = CT;
+                     continue;
+                  }
+               }
+            }
+            break;
+         }
+
+
+         // at this point, we have a variable number of metadata lines, with 
various orders
+         // we scan the start of each line for the special character, and run 
to the end character
+         // pull apart the body, getting the title (^....\x0d\x0a)
+         boolean metastarted = false;
+         String longline = "";
+         String longkey = "";
+         while (read < value.length) {
+            byte val_next = value[read++];
+
+            // eat up whitespace before committing to the next section
+            if ((val_next == SP) || (val_next == TB) || (val_next == CR) || 
(val_next == LF)) {
+               continue;
+            }
+
+            if (val_next == CT) {      //  start of a new section , could be 
authors, sources, etc
+               val_next =  (read < value.length) ? value[read++] : 0x00;
+               String tmp_line = "";
+               while ((val_next != LT) && (val_next != CT) && (val_next != CR) 
&& (val_next != LF) && (val_next != 0))  {
+                  // less than delimiter (\x3c), maybe also badly formed with 
just new line
+                  tmp_line += (char)(val_next & 0xff);  // convert the byte to 
an unsigned int
+                  val_next =  (read < value.length) ? value[read++] : 0x00;
+                  if (read > value.length) { break; }  // shouldn't ever hit 
this, but save a NPE
+               }
+
+               if (val_next == CT) {      //  start of a new section , when 
first didn't finish cleanly
+                   --read;
+               }
+
+               if (val_next == LT) {
+                  // hit the delimiter, carry on
+                  val_next =  (read < value.length) ? value[read++] : 0x00;
+               }
+
+               while ((val_next == CR) || (val_next == LF)) {
+                  val_next =  (read < value.length) ? value[read++] : 0x00;  
// skip the new lines
+                  if ((val_next != CR) && (val_next != LF)) {
+                     --read;
+                  }
+               }
+               if (tmp_line.toLowerCase().startsWith("by") || 
longline.equals("bdy_author")) {
+                  longkey = "bdy_author";
+
+                  // prepend a space to subsequent line, so it gets parsed 
consistent with the lead line
+                  tmp_line = (longline.equals(longkey) ? " " : "") + tmp_line;
+
+                  // we have an author candidate
+                  int term = tmp_line.length();
+                  term = Math.min(term, (tmp_line.indexOf("<")  > -1 ? 
tmp_line.indexOf("<")  : term));
+                  term = Math.min(term, (tmp_line.indexOf("=")  > -1 ? 
tmp_line.indexOf("=")  : term));
+                  term = Math.min(term, (tmp_line.indexOf("\n") > -1 ? 
tmp_line.indexOf("\n") : term));
+                  term = (term > 0 ) ? term : tmp_line.length();
+                  bdy_author += tmp_line.substring(tmp_line.indexOf(" "), 
term);
+                  metastarted = true;
+                  longline = ((tmp_line.indexOf("=")  > -1) && 
(!longline.equals(longkey)) ? longkey : "");
+               }
+               else if (FORMAT == this.FMT_IPTC_BLM) {
+                  String byline = "   by ";
+                  if (tmp_line.toLowerCase().contains(byline)) {
+                     longkey = "bdy_author";
+
+                     int term = tmp_line.length();
+                     term = Math.min(term, (tmp_line.indexOf("<")  > -1 ? 
tmp_line.indexOf("<")  : term));
+                     term = Math.min(term, (tmp_line.indexOf("=")  > -1 ? 
tmp_line.indexOf("=")  : term));
+                     term = Math.min(term, (tmp_line.indexOf("\n") > -1 ? 
tmp_line.indexOf("\n") : term));
+                     term = (term > 0 ) ? term : tmp_line.length();
+                     // for bloomberg, the author line sits below their 
copyright statement
+                     bdy_author += 
tmp_line.substring(tmp_line.toLowerCase().indexOf(byline) + byline.length(), 
term) + " ";
+                     metastarted = true;
+                     longline = ((tmp_line.indexOf("=")  > -1) && 
(!longline.equals(longkey)) ? longkey : "");
+                  }
+                  else if(tmp_line.toLowerCase().startsWith("c.")) {
+                     // the author line for bloomberg is a multiline starting 
with c.2011 Bloomberg News
+                     // then containing the author info on the next line
+                     if (val_next == TB) {
+                        value[--read] = CT;
+                        continue;
+                     }
+                  }
+                  else if(tmp_line.toLowerCase().trim().startsWith("(") && 
tmp_line.toLowerCase().trim().endsWith(")")) {
+                     // the author line may have one or more comment lines 
between the copyright
+                     // statement, and the By AUTHORNAME line
+                     if (val_next == TB) {
+                        value[--read] = CT;
+                        continue;
+                     }
+                  }
+               }
+
+               else if (tmp_line.toLowerCase().startsWith("eds") || 
longline.equals("bdy_source")) {
+                  longkey = "bdy_source";
+                  // prepend a space to subsequent line, so it gets parsed 
consistent with the lead line
+                  tmp_line = (longline.equals(longkey) ? " " : "") + tmp_line;
+
+                  // we have a source candidate
+                  int term = tmp_line.length();
+                  term = Math.min(term, (tmp_line.indexOf("<")  > -1 ? 
tmp_line.indexOf("<")  : term));
+                  term = Math.min(term, (tmp_line.indexOf("=")  > -1 ? 
tmp_line.indexOf("=")  : term));
+//                  term = Math.min(term, (tmp_line.indexOf("\n") > -1 ? 
tmp_line.indexOf("\n") : term));
+                  term = (term > 0 ) ? term : tmp_line.length();
+                  bdy_source += tmp_line.substring(tmp_line.indexOf(" ") + 1, 
term) + " ";
+                  metastarted = true;
+                  longline = (!longline.equals(longkey) ? longkey  : "");
+               }
+               else {
+                  // this has fallen all the way through.  trap it as part of 
the subject,
+                  // rather than just losing it
+                  if (!metastarted) {
+                     bdy_title += " , " + tmp_line;     //  not sure where 
else to put this but in the title
+                  }
+                  else {
+                     // what to do with stuff that is metadata, which falls 
after metadata lines started?
+                     bdy_body += " " + tmp_line + " , ";     //  not sure where else to put this but in the body
+                  }
+               }
+            }
+            else {  // we're on to the main body
+               while ((read < value.length) && (val_next != 0))  {
+                  // read until the train runs out of tracks
+                  bdy_body += (char)(val_next & 0xff);  // convert the byte to 
an unsigned int
+                  val_next =  (read < value.length) ? value[read++] : 0x00;
+                  if (read > value.length) { break; }  // shouldn't ever hit 
this, but save a NPE
+               }
+
+            }
+            // we would normally break here, but just let this read out to the 
end
+         }
+         done = true; // don't let this run back through and start thrashing 
metadata
+      }
+      properties.put("body", bdy_body);
+      properties.put("title", bdy_title);
+      properties.put("subject", bdy_heading);
+      properties.put("author", bdy_author);
+      properties.put("source", bdy_source);
+
+      added = (!bdy_body.isEmpty() || !bdy_title.isEmpty() || 
!bdy_heading.isEmpty() || !bdy_author.isEmpty() || !bdy_source.isEmpty());
+
+      return (added);
+   }
+
+
+   /**
+    * Parses the footer of a wire story: a leading run of non-digit bytes is
+    * taken as the news feed source, and the text that follows (up to the
+    * first LT, CR, LF or NUL byte) is taken as the story date/time.
+    * <p>
+    * The date/time is re-formatted as <code>yyyy-MM-dd'T'HH:mm:ss'Z'</code> in UTC.
+    * If the raw text cannot be parsed, the current time is used instead.
+    * <p>
+    * The keys "publisher", "created" and "modified" are always written to
+    * <code>properties</code>, as empty strings when nothing was found.
+    *
+    * @param value      raw footer bytes; null or empty is treated as "no footer"
+    * @param properties map that receives the extracted values
+    * @return true if a source or a date/time was extracted
+    */
+   private boolean parseFooter(byte[] value, HashMap<String,String> properties) {
+      StringBuilder source = new StringBuilder();
+      StringBuilder rawDate = new StringBuilder();
+      String ftr_datetime = "";
+
+      if ((value != null) && (value.length > 0)) {
+         int read = 0;
+
+         // pull apart the footer, getting the news feed source (^....\x0d\x0a)
+         // note: no look-ahead byte is read here; the previous look-ahead indexed
+         // value[read+1] unchecked and overflowed on a two byte footer
+         byte val_next = value[read++];
+
+         // consume all non-numerics first; bytes are signed, so anything
+         // at or above 0x80 compares below '0' and is also treated as source text
+         while (((val_next < (byte)0x30) || (val_next > (byte)0x39)) && (val_next != 0)) {
+            source.append((char)(val_next & 0xff));  // convert the byte to an unsigned int
+            val_next = (read < value.length) ? value[read++] : 0x00;  // 0x00 marks end of input
+         }
+
+         // get as much of the date/time as possible
+         while ((val_next != LT) && (val_next != CR) && (val_next != LF) && (val_next != 0)) {
+            rawDate.append((char)(val_next & 0xff));  // convert the byte to an unsigned int
+            val_next = (read < value.length) ? value[read++] : 0x00;
+         }
+
+         if (rawDate.length() > 0) {
+            // we want to pass this back in a more friendly format
+            // standard ap format; this is an american format, so arrives as mm-dd-yy HHmmzzz
+            String format_in = "MM-dd-yy HHmmzzz";
+            if (FORMAT == this.FMT_IPTC_RTR) {
+               // standard reuters format
+               format_in = "HH:mm MM-dd-yy";
+            }
+
+            Date dateunix = new Date();
+            try {
+               SimpleDateFormat dfi = new SimpleDateFormat(format_in);
+               dfi.setTimeZone(TimeZone.getTimeZone("UTC"));
+               dateunix = dfi.parse(rawDate.toString());
+            }
+            catch (ParseException ignored) {
+               // deliberate best effort: keep the date set to now
+            }
+
+            SimpleDateFormat dfo = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'");
+            dfo.setTimeZone(TimeZone.getTimeZone("UTC"));
+            ftr_datetime = dfo.format(dateunix);
+         }
+         // anything after the date/time (delimiter, new lines) is ignored
+      }
+
+      String ftr_source = source.toString();
+      properties.put("publisher", ftr_source);
+      properties.put("created", ftr_datetime);
+      properties.put("modified", ftr_datetime);
+
+      return (!ftr_source.isEmpty() || !ftr_datetime.isEmpty());
+   }
+
+
+   private void setMetadata(Metadata metadata, HashMap<String,String> 
properties) {
+
+      // every property that gets set must be non-null, or it will cause NPE
+      // in other consuming applications, like Lucene
+      metadata.set(Metadata.CONTENT_TYPE,  clean("text/anpa-1312"));
+      metadata.set(Metadata.TITLE,         clean(properties.get("title")));
+      metadata.set(Metadata.SUBJECT,       clean(properties.get("subject")));
+      metadata.set(Metadata.AUTHOR,        clean(properties.get("author")));
+      metadata.set(Metadata.CREATION_DATE, clean(properties.get("created")));
+      metadata.set(Metadata.MODIFIED,      clean(properties.get("modified")));
+      metadata.set(DublinCore.SOURCE,      clean(properties.get("source")));
+//      metadata.set(Metadata.PUBLISHER,     
clean(properties.get("publisher")));
+      metadata.set(Metadata.PUBLISHER,     clean(this.getFormatName()));
+
+
+/*
+        metadata.set(DublinCore.DATE, font.getHeader().getCreated().getTime());
+        metadata.set(
+                Property.internalDate(DublinCore.MODIFIED),
+                font.getHeader().getModified().getTime());
+*/
+
+   }
+
+
+   /**
+    * Normalises a value before it is stored as metadata.
+    * <p>
+    * The text is round-tripped through UTF-8, doubled back-ticks and doubled
+    * apostrophes are collapsed to a single character, the SL/SR markers are
+    * replaced by an apostrophe, the DL/DR markers by a double quote, and
+    * surrounding whitespace is trimmed.
+    * (SL, SR, DL and DR are declared elsewhere in this class -- presumably the
+    * single/double left/right quote codes used by the feed; verify there.)
+    *
+    * @param value the raw value, may be null
+    * @return the cleaned value; never null, an empty string for null input
+    */
+   private String clean(String value) {
+
+      if (value == null) {
+         return "";
+      }
+
+      // the Charset overloads cannot throw UnsupportedEncodingException,
+      // so there is no exception left to swallow
+      Charset utf8 = Charset.forName("UTF-8");
+      String cleaned = new String(value.getBytes(utf8), utf8);
+
+      // String.replace() substitutes literal text; replaceAll() would treat
+      // the search strings as regular expressions
+      cleaned = cleaned.replace("``", "`");
+      cleaned = cleaned.replace("''", "'");
+      cleaned = cleaned.replace(new String(new char[] {SL}), "'");
+      cleaned = cleaned.replace(new String(new char[] {SR}), "'");
+      cleaned = cleaned.replace(new String(new char[] {DL}), "\"");
+      cleaned = cleaned.replace(new String(new char[] {DR}), "\"");
+
+      return cleaned.trim();
+   }
+   
+}

Modified: 
tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser?rev=1331794&r1=1331793&r2=1331794&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
 (original)
+++ 
tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
 Sat Apr 28 16:53:35 2012
@@ -25,6 +25,7 @@ org.apache.tika.parser.html.HtmlParser
 org.apache.tika.parser.image.ImageParser
 org.apache.tika.parser.image.PSDParser
 org.apache.tika.parser.image.TiffParser
+org.apache.tika.parser.iptc.IptcAnpaParser
 org.apache.tika.parser.iwork.IWorkPackageParser
 org.apache.tika.parser.jpeg.JpegParser
 org.apache.tika.parser.mail.RFC822Parser

svn commit: r1331794 - in /tika/trunk: tika-core/src/main/resources/org/apache/tika/mime/ tika-parsers/src/main/java/org/apache/tika/parser/iptc/ tika-parsers/src/main/resources/META-INF/services/

Reply via email to