Author: jukka
Date: Tue Dec  2 15:13:05 2008
New Revision: 722663

URL: http://svn.apache.org/viewvc?rev=722663&view=rev
Log:
TIKA-172: New Open Document Parser that emits structured XHTML content

Patch by Uwe Schindler.

Added:
    
lucene/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/NSNormalizerContentHandler.java
    
lucene/tika/trunk/src/main/java/org/apache/tika/sax/ElementMappingContentHandler.java
Modified:
    
lucene/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeContentParser.java
    
lucene/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeMetaParser.java
    lucene/tika/trunk/src/main/resources/mime/tika-mimetypes.xml

Added: 
lucene/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/NSNormalizerContentHandler.java
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/NSNormalizerContentHandler.java?rev=722663&view=auto
==============================================================================
--- 
lucene/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/NSNormalizerContentHandler.java
 (added)
+++ 
lucene/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/NSNormalizerContentHandler.java
 Tue Dec  2 15:13:05 2008
@@ -0,0 +1,86 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.opendocument;
+
+import java.util.Map;
+import javax.xml.namespace.QName;
+import java.io.StringReader;
+import java.io.IOException;
+
+import org.apache.tika.sax.ContentHandlerDecorator;
+
+import org.xml.sax.SAXException;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.Attributes;
+import org.xml.sax.InputSource;
+import org.xml.sax.helpers.AttributesImpl;
+
+/**
+ * Content handler decorator that:<ul>
+ * <li>Maps old OpenOffice 1.0 Namespaces to the OpenDocument ones</li>
+ * <li>Returns a fake DTD when parser requests OpenOffice DTD</li>
+ * </ul>
+ */
+public class NSNormalizerContentHandler extends ContentHandlerDecorator {
+
+       /** Prefix shared by all OpenOffice 1.0 namespace URIs. */
+       private static final String OLD_NS_PREFIX = "http://openoffice.org/2000/";
+
+       public NSNormalizerContentHandler(ContentHandler handler) {
+               super(handler);
+       }
+
+       /** Rewrites an OpenOffice 1.0 namespace URI to its OpenDocument form; null and all other URIs pass through unchanged. */
+       private static String mapOldNS(String ns) {
+               if (ns == null || !ns.startsWith(OLD_NS_PREFIX)) return ns;
+               return "urn:oasis:names:tc:opendocument:xmlns:" + ns.substring(OLD_NS_PREFIX.length()) + ":1.0";
+       }
+
+       /** Forwards the element with its own namespace and the namespace of every attribute normalized. */
+       @Override
+       public void startElement(String namespaceURI, String localName, String qName, Attributes atts) throws SAXException {
+               AttributesImpl natts = new AttributesImpl();
+               for (int i = 0; i < atts.getLength(); i++) {
+                       natts.addAttribute(mapOldNS(atts.getURI(i)), atts.getLocalName(i), atts.getQName(i), atts.getType(i), atts.getValue(i));
+               }
+               // forward the rewritten attributes; passing the original atts would discard the mapping done above
+               super.startElement(mapOldNS(namespaceURI), localName, qName, natts);
+       }
+
+       /** Forwards the end tag with the same namespace normalization that startElement applies. */
+       @Override
+       public void endElement(String namespaceURI, String localName, String qName) throws SAXException {
+               super.endElement(mapOldNS(namespaceURI), localName, qName);
+       }
+
+       /** Forwards the prefix mapping with a normalized namespace URI. */
+       @Override
+       public void startPrefixMapping(String prefix, String uri) throws SAXException {
+               super.startPrefixMapping(prefix, mapOldNS(uri));
+       }
+
+       /** Do not load any DTDs (may be requested by parser). Fake the DTD by returning an empty document as InputSource. */
+       @Override
+       public InputSource resolveEntity(String publicId, String systemId) throws IOException, SAXException {
+               boolean officeDtd = "-//OpenOffice.org//DTD OfficeDocument 1.0//EN".equals(publicId);
+               boolean anyDtd = systemId != null && systemId.toLowerCase().endsWith(".dtd");
+               if (officeDtd || anyDtd) {
+                       return new InputSource(new StringReader(""));
+               }
+               return super.resolveEntity(publicId, systemId);
+       }
+}

Modified: 
lucene/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeContentParser.java
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeContentParser.java?rev=722663&r1=722662&r2=722663&view=diff
==============================================================================
--- 
lucene/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeContentParser.java
 (original)
+++ 
lucene/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeContentParser.java
 Tue Dec  2 15:13:05 2008
@@ -16,25 +16,177 @@
  */
 package org.apache.tika.parser.opendocument;
 
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Stack;
+import java.util.HashMap;
+import java.util.Collections;
+import java.util.BitSet;
+
+import javax.xml.parsers.ParserConfigurationException;
+import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.SAXParserFactory;
+import javax.xml.namespace.QName;
+
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.xml.XMLParser;
-import org.apache.tika.sax.xpath.MatchingContentHandler;
-import org.apache.tika.sax.xpath.XPathParser;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.ElementMappingContentHandler;
+import org.apache.tika.sax.ElementMappingContentHandler.TargetElement;
+import org.apache.tika.sax.XHTMLContentHandler;
 import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
 import org.xml.sax.helpers.DefaultHandler;
+import org.xml.sax.Attributes;
+
+import static org.apache.tika.sax.XHTMLContentHandler.XHTML;
 
 /**
  * Parser for OpenDocument <code>content.xml</code> files.
  */
-public class OpenOfficeContentParser extends XMLParser {
+public class OpenOfficeContentParser implements Parser {
+
+       public static final String TEXT_NS="urn:oasis:names:tc:opendocument:xmlns:text:1.0"; // OpenDocument text: namespace
+       public static final String TABLE_NS="urn:oasis:names:tc:opendocument:xmlns:table:1.0"; // OpenDocument table: namespace
+       public static final String XLINK_NS="http://www.w3.org/1999/xlink"; // XLink namespace, used for the href attribute of text:a
+       
+       protected static final char[] TAB=new char[]{'\t'};
 
-    private static final XPathParser OFFICE_XPATH = new XPathParser(
-            "office", "urn:oasis:names:tc:opendocument:xmlns:office:1.0");
+       /**
+        * Mappings between OpenDocument tag names and XHTML tag names 
(including attributes).
+        * All other tag names/attributes are ignored and left out from event 
stream. 
+        */
+       private static final HashMap<QName,TargetElement> MAPPINGS=new 
HashMap<QName,TargetElement>();
+       static {
+               // general mappings of text:-tags
+               MAPPINGS.put(new QName(TEXT_NS,"p"), new 
TargetElement(XHTML,"p"));
+               // text:h-tags are mapped specifically in 
startElement/endElement
+               MAPPINGS.put(new QName(TEXT_NS,"line-break"), new 
TargetElement(XHTML,"br"));
+               MAPPINGS.put(new QName(TEXT_NS,"list"), new 
TargetElement(XHTML,"ul"));
+               MAPPINGS.put(new QName(TEXT_NS,"list-item"), new 
TargetElement(XHTML,"li"));
+               MAPPINGS.put(new QName(TEXT_NS,"note"), new 
TargetElement(XHTML,"div"));
+               MAPPINGS.put(new QName(TEXT_NS,"span"), new 
TargetElement(XHTML,"span"));
+               MAPPINGS.put(new QName(TEXT_NS,"a"),new TargetElement(XHTML,"a",
+                       Collections.singletonMap(new QName(XLINK_NS,"href"), 
new QName("href"))
+               ));
+               
+               // create HTML tables from table:-tags
+               MAPPINGS.put(new QName(TABLE_NS,"table"), new 
TargetElement(XHTML,"table"));
+               // repeating of rows is ignored; for columns, see below!
+               MAPPINGS.put(new QName(TABLE_NS,"table-row"), new 
TargetElement(XHTML,"tr"));
+               // special mapping for rowspan/colspan attributes
+               final HashMap<QName,QName> tableCellAttsMapping=new 
HashMap<QName,QName>();
+               tableCellAttsMapping.put(new 
QName(TABLE_NS,"number-columns-spanned"),new QName("colspan"));
+               tableCellAttsMapping.put(new 
QName(TABLE_NS,"number-rows-spanned"),new QName("rowspan"));
+               /* TODO: The following is not correct, the cell should be 
repeated not spanned!
+                * Code generates a HTML cell, spanning all repeated columns, 
to make the cell look correct.
+                * Problems may occur when both spanning and repeating is 
given, which is not allowed by spec.
+                * Cell spanning instead of repeating  is not a problem, 
because OpenOffice uses it
+                * only for empty cells.
+                */
+               tableCellAttsMapping.put(new 
QName(TABLE_NS,"number-columns-repeated"),new QName("colspan"));
+               MAPPINGS.put(new QName(TABLE_NS,"table-cell"), new 
TargetElement(XHTML,"td",tableCellAttsMapping));
+       }
+               
+       public void parse(InputStream stream, ContentHandler handler, Metadata 
metadata)
+               throws IOException, SAXException, TikaException {
 
-    protected DefaultHandler getDefaultHandler(ContentHandler ch, Metadata md) 
{
-        return new MatchingContentHandler(
-                super.getDefaultHandler(ch, md),
-                OFFICE_XPATH.parse("//office:body//text()"));
-    }
+               final XHTMLContentHandler xhtml = new 
XHTMLContentHandler(handler,metadata);
+               final DefaultHandler dh = new 
ElementMappingContentHandler(xhtml, MAPPINGS) {
+                       private final BitSet textNodeStack=new BitSet();
+                       private int nodeDepth=0,completelyFiltered=0;
+                       private Stack<String> headingStack=new Stack<String>();
+                       
+                       @Override
+                       public void characters(char[] ch, int start, int 
length) throws SAXException {
+                               // only forward content of tags from 
text:-namespace
+                               if (completelyFiltered==0 && nodeDepth>0 && 
textNodeStack.get(nodeDepth-1))
+                                       super.characters(ch,start,length);
+                       }
+                       
+                       // helper for checking tags which need complete 
filtering (with sub-tags)
+                       private final boolean needsCompleteFiltering(String 
namespaceURI, String localName) {
+                               return (
+                                       (TEXT_NS.equals(namespaceURI) && 
(localName.endsWith("-template") || localName.endsWith("-style"))) ||
+                                       (TABLE_NS.equals(namespaceURI) && 
"covered-table-cell".equals(localName))
+                               );
+                       }
+                       
+                       // map the heading level to <hX> HTML tags (clamped to h1..h6); an absent or malformed level becomes h1
+                       private final String getXHTMLHeaderTagName(Attributes atts) {
+                               int depth=1;
+                               try {
+                                       // parseInt rejects null with NumberFormatException as well, so a missing attribute needs no separate check
+                                       depth=Integer.parseInt(atts.getValue(TEXT_NS,"outline-level"));
+                               } catch (NumberFormatException ignored) { /* keep the h1 default instead of aborting the whole parse */ }
+                               return "h"+Math.max(1,Math.min(6,depth));
+                       }
+                       
+                       @Override
+                       public void startElement(String namespaceURI, String 
localName, String qName, Attributes atts) throws SAXException {
+                               // keep track of current node type. If it is a text node, a bit at the current depth is set in textNodeStack.
+                               // characters() checks the top bit to determine whether the current node is a text node whose content is forwarded.
+                               // nodeDepth contains the depth of the current node and also marks top of stack.
+                               assert nodeDepth>=0;
+                               textNodeStack.set(nodeDepth++, 
TEXT_NS.equals(namespaceURI));
+                               // filter *all* content of some tags
+                               assert completelyFiltered>=0;
+                               if 
(needsCompleteFiltering(namespaceURI,localName)) completelyFiltered++;
+                               // call next handler if no filtering
+                               if (completelyFiltered==0) {
+                                       // special handling of text:h, that are 
directly passed to xhtml handler
+                                       if (TEXT_NS.equals(namespaceURI) && 
"h".equals(localName)) {
+                                               
xhtml.startElement(headingStack.push(getXHTMLHeaderTagName(atts)));
+                                       } else {
+                                               
super.startElement(namespaceURI,localName,qName,atts);
+                                       }
+                               }
+                       }
+                       
+                       @Override
+                       public void endElement(String namespaceURI, String 
localName, String qName) throws SAXException {
+                               // call next handler if no filtering
+                               if (completelyFiltered==0) {
+                                       // special handling of text:h, that are 
directly passed to xhtml handler
+                                       if (TEXT_NS.equals(namespaceURI) && 
"h".equals(localName)) {
+                                               
xhtml.endElement(headingStack.pop());
+                                       } else {
+                                               
super.endElement(namespaceURI,localName,qName);
+                                       }
+                                       // special handling of tabulators
+                                       if (TEXT_NS.equals(namespaceURI) && 
("tab-stop".equals(localName) || "tab".equals(localName)))
+                                               
this.characters(TAB,0,TAB.length);
+                               }
+                               // revert filter for *all* content of some tags
+                               if 
(needsCompleteFiltering(namespaceURI,localName)) completelyFiltered--;
+                               assert completelyFiltered>=0;
+                               // reduce current node depth
+                               nodeDepth--;
+                               assert nodeDepth>=0;
+                       }
+                       
+                       @Override
+                       public void startPrefixMapping(String prefix, String 
uri) throws SAXException {
+                               // remove prefix mappings as they should not 
occur in XHTML
+                       }
+                       
+                       @Override
+                       public void endPrefixMapping(String prefix) throws 
SAXException {
+                               // remove prefix mappings as they should not 
occur in XHTML
+                       }
+                       
+               };
+               
+               try {
+                       SAXParserFactory factory = 
SAXParserFactory.newInstance();
+                       factory.setValidating(false);
+                       factory.setNamespaceAware(true);
+                       SAXParser parser = factory.newSAXParser();
+                       parser.parse(new CloseShieldInputStream(stream),new 
NSNormalizerContentHandler(dh));
+               } catch (ParserConfigurationException e) {
+                       throw new TikaException("XML parser configuration 
error", e);
+               }
+       }
 
 }

Modified: 
lucene/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeMetaParser.java
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeMetaParser.java?rev=722663&r1=722662&r2=722663&view=diff
==============================================================================
--- 
lucene/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeMetaParser.java
 (original)
+++ 
lucene/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeMetaParser.java
 Tue Dec  2 15:13:05 2008
@@ -65,6 +65,7 @@
         dh = getStatistic(dh, md, "nbPara", "paragraph-count");
         dh = getStatistic(dh, md, "nbWord", "word-count");
         dh = getStatistic(dh, md, "nbCharacter", "character-count");
+               dh = new NSNormalizerContentHandler(dh);
         return dh;
     }
 

Added: 
lucene/tika/trunk/src/main/java/org/apache/tika/sax/ElementMappingContentHandler.java
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/sax/ElementMappingContentHandler.java?rev=722663&view=auto
==============================================================================
--- 
lucene/tika/trunk/src/main/java/org/apache/tika/sax/ElementMappingContentHandler.java
 (added)
+++ 
lucene/tika/trunk/src/main/java/org/apache/tika/sax/ElementMappingContentHandler.java
 Tue Dec  2 15:13:05 2008
@@ -0,0 +1,115 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.sax;
+
+import java.util.Map;
+import java.util.Collections;
+import javax.xml.namespace.QName;
+
+import org.xml.sax.SAXException;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.Attributes;
+import org.xml.sax.helpers.AttributesImpl;
+
+/**
+ * Content handler decorator that maps element <code>QName</code>s using a <code>Map</code>.
+ * Elements without a mapping are not forwarded.
+ * Attributes may also be mapped (each element can have its own <code>Map</code> of attribute names);
+ * attributes without a mapping are not forwarded.
+ * By default no attributes are mapped, so none of them are forwarded.
+ */
+public class ElementMappingContentHandler extends ContentHandlerDecorator {
+
+       private final Map<QName,TargetElement> mappings;
+
+       public ElementMappingContentHandler(ContentHandler handler, 
Map<QName,TargetElement> mappings) {
+               super(handler);
+               this.mappings=mappings;
+       }
+
+       @Override
+       public void startElement(String namespaceURI, String localName, String 
qName, Attributes atts) throws SAXException {
+               final TargetElement mapping=mappings.get(new 
QName(namespaceURI,localName));
+               if (mapping!=null) {
+                       final QName tag=mapping.getMappedTagName();
+                       
super.startElement(tag.getNamespaceURI(),tag.getLocalPart(),getQNameAsString(tag),mapping.mapAttributes(atts));
+               }
+       }
+
+       @Override
+       public void endElement(String namespaceURI, String localName, String 
qName) throws SAXException {
+               final TargetElement mapping=mappings.get(new 
QName(namespaceURI,localName));
+               if (mapping!=null) {
+                       final QName tag=mapping.getMappedTagName();
+                       
super.endElement(tag.getNamespaceURI(),tag.getLocalPart(),getQNameAsString(tag));
+               }
+       }
+       
+       protected static final String getQNameAsString(final QName qname) {
+               final StringBuilder qn=new StringBuilder(qname.getPrefix());
+               if (qn.length()>0) qn.append(':');
+               return qn.append(qname.getLocalPart()).toString();
+       }
+
+       public static class TargetElement {
+       
+               /** Creates an TargetElement, attributes of this element will 
be mapped as specified */
+               public TargetElement(QName mappedTagName, Map<QName,QName> 
attributesMapping) {
+                       this.mappedTagName=mappedTagName;
+                       this.attributesMapping=attributesMapping;
+               }
+               
+               /** A shortcut that automatically creates the QName object */
+               public TargetElement(String mappedTagURI, String 
mappedTagLocalName, Map<QName,QName> attributesMapping) {
+                       this(new QName(mappedTagURI,mappedTagLocalName), 
attributesMapping);
+               }
+               
+               /** Creates an TargetElement with no attributes, all attributes 
will be deleted from SAX stream */
+               public TargetElement(QName mappedTagName) {
+                       this(mappedTagName, 
Collections.<QName,QName>emptyMap());
+               }
+               
+               /** A shortcut that automatically creates the QName object */
+               public TargetElement(String mappedTagURI, String 
mappedTagLocalName) {
+                       this(mappedTagURI, mappedTagLocalName, 
Collections.<QName,QName>emptyMap());
+               }
+               
+               public QName getMappedTagName() {
+                       return mappedTagName;
+               }
+               
+               public Map<QName,QName> getAttributesMapping() {
+                       return attributesMapping;
+               }
+               
+               public Attributes mapAttributes(final Attributes atts) {
+                       final AttributesImpl natts = new AttributesImpl();
+                       for (int i = 0; i < atts.getLength(); i++) {
+                               QName name=attributesMapping.get(new 
QName(atts.getURI(i), atts.getLocalName(i)));
+                               if (name!=null) natts.addAttribute(
+                                       name.getNamespaceURI(), 
name.getLocalPart(), getQNameAsString(name),
+                                       atts.getType(i), atts.getValue(i)
+                               );
+                       }
+                       return natts;
+               }
+               
+               private final QName mappedTagName;
+               private final Map<QName,QName> attributesMapping;
+       }
+       
+}

Modified: lucene/tika/trunk/src/main/resources/mime/tika-mimetypes.xml
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/resources/mime/tika-mimetypes.xml?rev=722663&r1=722662&r2=722663&view=diff
==============================================================================
--- lucene/tika/trunk/src/main/resources/mime/tika-mimetypes.xml (original)
+++ lucene/tika/trunk/src/main/resources/mime/tika-mimetypes.xml Tue Dec  2 
15:13:05 2008
@@ -387,6 +387,20 @@
     </magic>
   </mime-type>
 
+  <mime-type type="application/vnd.sun.xml.writer">
+    <comment>
+      OpenOffice v1.0: Writer Document
+    </comment>
+    <alias type="application/x-vnd.sun.xml.writer" />
+    <glob pattern="*.sxw" />
+    <magic>
+      <match type="string" offset="0" value="PK">
+        <match type="string" offset="30"
+          value="mimetypeapplication/vnd.sun.xml.writer" />
+      </match>
+    </magic>
+  </mime-type>
+
   <mime-type type="application/zip">
     <alias type="application/x-zip-compressed" />
     <magic priority="40">


Reply via email to