Author: jukka
Date: Tue Dec 2 15:13:05 2008
New Revision: 722663
URL: http://svn.apache.org/viewvc?rev=722663&view=rev
Log:
TIKA-172: New Open Document Parser that emits structured XHTML content
Patch by Uwe Schindler.
Added:
lucene/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/NSNormalizerContentHandler.java
lucene/tika/trunk/src/main/java/org/apache/tika/sax/ElementMappingContentHandler.java
Modified:
lucene/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeContentParser.java
lucene/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeMetaParser.java
lucene/tika/trunk/src/main/resources/mime/tika-mimetypes.xml
Added:
lucene/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/NSNormalizerContentHandler.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/NSNormalizerContentHandler.java?rev=722663&view=auto
==============================================================================
---
lucene/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/NSNormalizerContentHandler.java
(added)
+++
lucene/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/NSNormalizerContentHandler.java
Tue Dec 2 15:13:05 2008
@@ -0,0 +1,86 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.opendocument;
+
+import java.util.Map;
+import javax.xml.namespace.QName;
+import java.io.StringReader;
+import java.io.IOException;
+
+import org.apache.tika.sax.ContentHandlerDecorator;
+
+import org.xml.sax.SAXException;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.Attributes;
+import org.xml.sax.InputSource;
+import org.xml.sax.helpers.AttributesImpl;
+
+/**
+ * Content handler decorator that:<ul>
+ * <li>Maps old OpenOffice 1.0 namespaces to the OpenDocument ones, for
+ * element names, attribute names and prefix mappings</li>
+ * <li>Returns a fake (empty) DTD when the parser requests the OpenOffice DTD
+ * or any other <code>.dtd</code> resource</li>
+ * </ul>
+ */
+public class NSNormalizerContentHandler extends ContentHandlerDecorator {
+
+    /** Common prefix of all OpenOffice 1.0 namespace URIs. */
+    private static final String OLD_NS_PREFIX = "http://openoffice.org/2000/";
+
+    /** Prefix of the OpenDocument namespace URIs the old ones are mapped to. */
+    private static final String NEW_NS_PREFIX = "urn:oasis:names:tc:opendocument:xmlns:";
+
+    /** Version suffix appended to every mapped OpenDocument namespace URI. */
+    private static final String NEW_NS_SUFFIX = ":1.0";
+
+    /** Public identifier of the OpenOffice 1.0 document DTD. */
+    private static final String OOO_DTD_PUBLIC_ID = "-//OpenOffice.org//DTD OfficeDocument 1.0//EN";
+
+    /** File extension (compared case-insensitively) that marks a DTD system id. */
+    private static final String DTD_EXTENSION = ".dtd";
+
+    public NSNormalizerContentHandler(ContentHandler handler) {
+        super(handler);
+    }
+
+    /**
+     * Translates an OpenOffice 1.0 namespace URI into the matching
+     * OpenDocument URN; any other value (including <code>null</code>) is
+     * returned unchanged.
+     */
+    private static String mapOldNS(String ns) {
+        if (ns == null || !ns.startsWith(OLD_NS_PREFIX)) {
+            return ns;
+        }
+        return NEW_NS_PREFIX + ns.substring(OLD_NS_PREFIX.length()) + NEW_NS_SUFFIX;
+    }
+
+    /**
+     * Forwards the element with its own namespace and the namespaces of all
+     * of its attributes normalized.
+     */
+    @Override
+    public void startElement(String namespaceURI, String localName, String qName, Attributes atts)
+            throws SAXException {
+        AttributesImpl natts = new AttributesImpl();
+        for (int i = 0; i < atts.getLength(); i++) {
+            natts.addAttribute(
+                mapOldNS(atts.getURI(i)), atts.getLocalName(i), atts.getQName(i),
+                atts.getType(i), atts.getValue(i)
+            );
+        }
+        // forward the remapped copy, not the original attributes; otherwise
+        // attribute namespaces would reach the delegate unnormalized
+        super.startElement(mapOldNS(namespaceURI), localName, qName, natts);
+    }
+
+    @Override
+    public void endElement(String namespaceURI, String localName, String qName) throws SAXException {
+        super.endElement(mapOldNS(namespaceURI), localName, qName);
+    }
+
+    @Override
+    public void startPrefixMapping(String prefix, String uri) throws SAXException {
+        super.startPrefixMapping(prefix, mapOldNS(uri));
+    }
+
+    /**
+     * Do not load any DTDs (may be requested by the parser). The DTD is faked
+     * by returning an empty string as <code>InputSource</code>; every other
+     * entity is resolved by the superclass.
+     */
+    @Override
+    public InputSource resolveEntity(String publicId, String systemId) throws IOException, SAXException {
+        if (OOO_DTD_PUBLIC_ID.equals(publicId) || isDtd(systemId)) {
+            return new InputSource(new StringReader(""));
+        }
+        return super.resolveEntity(publicId, systemId);
+    }
+
+    /**
+     * Checks whether the system id ends with <code>.dtd</code>, ignoring case.
+     * <code>regionMatches</code> is locale independent and returns
+     * <code>false</code> for ids shorter than the extension.
+     */
+    private static boolean isDtd(String systemId) {
+        return systemId != null && systemId.regionMatches(
+            true, systemId.length() - DTD_EXTENSION.length(),
+            DTD_EXTENSION, 0, DTD_EXTENSION.length());
+    }
+
+}
Modified:
lucene/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeContentParser.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeContentParser.java?rev=722663&r1=722662&r2=722663&view=diff
==============================================================================
---
lucene/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeContentParser.java
(original)
+++
lucene/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeContentParser.java
Tue Dec 2 15:13:05 2008
@@ -16,25 +16,177 @@
*/
package org.apache.tika.parser.opendocument;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Stack;
+import java.util.HashMap;
+import java.util.Collections;
+import java.util.BitSet;
+
+import javax.xml.parsers.ParserConfigurationException;
+import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.SAXParserFactory;
+import javax.xml.namespace.QName;
+
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.xml.XMLParser;
-import org.apache.tika.sax.xpath.MatchingContentHandler;
-import org.apache.tika.sax.xpath.XPathParser;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.ElementMappingContentHandler;
+import org.apache.tika.sax.ElementMappingContentHandler.TargetElement;
+import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
+import org.xml.sax.Attributes;
+
+import static org.apache.tika.sax.XHTMLContentHandler.XHTML;
/**
* Parser for OpenDocument <code>content.xml</code> files.
*/
-public class OpenOfficeContentParser extends XMLParser {
+public class OpenOfficeContentParser implements Parser {
+
+ public static final String
TEXT_NS="urn:oasis:names:tc:opendocument:xmlns:text:1.0";
+ public static final String
TABLE_NS="urn:oasis:names:tc:opendocument:xmlns:table:1.0";
+ public static final String XLINK_NS="http://www.w3.org/1999/xlink";
+
+ protected static final char[] TAB=new char[]{'\t'};
- private static final XPathParser OFFICE_XPATH = new XPathParser(
- "office", "urn:oasis:names:tc:opendocument:xmlns:office:1.0");
+ /**
+  * Translation table from OpenDocument element names to XHTML element names,
+  * including the attributes that are carried over. Elements and attributes
+  * without an entry are ignored and left out of the event stream.
+  */
+ private static final HashMap<QName,TargetElement> MAPPINGS =
+         new HashMap<QName,TargetElement>();
+ static {
+     // attribute translation for table cells: rowspan/colspan
+     final HashMap<QName,QName> cellAttributes = new HashMap<QName,QName>();
+     cellAttributes.put(new QName(TABLE_NS, "number-rows-spanned"), new QName("rowspan"));
+     cellAttributes.put(new QName(TABLE_NS, "number-columns-spanned"), new QName("colspan"));
+     /* TODO: The following is not correct, the cell should be repeated, not spanned!
+      * A single HTML cell spanning all repeated columns is generated so that the
+      * cell looks correct. Problems may occur when both spanning and repeating
+      * are given, which the spec does not allow. Spanning instead of repeating
+      * is not a problem, because OpenOffice uses repetition only for empty cells.
+      */
+     cellAttributes.put(new QName(TABLE_NS, "number-columns-repeated"), new QName("colspan"));
+
+     // table:-tags become HTML tables; repeating of rows is ignored
+     MAPPINGS.put(new QName(TABLE_NS, "table"), new TargetElement(XHTML, "table"));
+     MAPPINGS.put(new QName(TABLE_NS, "table-row"), new TargetElement(XHTML, "tr"));
+     MAPPINGS.put(new QName(TABLE_NS, "table-cell"), new TargetElement(XHTML, "td", cellAttributes));
+
+     // general text:-tags; text:h is mapped specifically in startElement/endElement
+     MAPPINGS.put(new QName(TEXT_NS, "p"), new TargetElement(XHTML, "p"));
+     MAPPINGS.put(new QName(TEXT_NS, "span"), new TargetElement(XHTML, "span"));
+     MAPPINGS.put(new QName(TEXT_NS, "line-break"), new TargetElement(XHTML, "br"));
+     MAPPINGS.put(new QName(TEXT_NS, "note"), new TargetElement(XHTML, "div"));
+     MAPPINGS.put(new QName(TEXT_NS, "list"), new TargetElement(XHTML, "ul"));
+     MAPPINGS.put(new QName(TEXT_NS, "list-item"), new TargetElement(XHTML, "li"));
+
+     // hyperlinks keep their target: xlink:href becomes href
+     MAPPINGS.put(
+         new QName(TEXT_NS, "a"),
+         new TargetElement(XHTML, "a",
+             Collections.singletonMap(new QName(XLINK_NS, "href"), new QName("href"))));
+ }
+
+ public void parse(InputStream stream, ContentHandler handler, Metadata
metadata)
+ throws IOException, SAXException, TikaException {
- protected DefaultHandler getDefaultHandler(ContentHandler ch, Metadata md)
{
- return new MatchingContentHandler(
- super.getDefaultHandler(ch, md),
- OFFICE_XPATH.parse("//office:body//text()"));
- }
+ final XHTMLContentHandler xhtml = new
XHTMLContentHandler(handler,metadata);
+ final DefaultHandler dh = new
ElementMappingContentHandler(xhtml, MAPPINGS) {
+ private final BitSet textNodeStack=new BitSet();
+ private int nodeDepth=0,completelyFiltered=0;
+ private Stack<String> headingStack=new Stack<String>();
+
+ @Override
+ public void characters(char[] ch, int start, int length) throws SAXException {
+     // nothing is forwarded inside a completely filtered subtree or outside any element
+     if (completelyFiltered != 0 || nodeDepth <= 0) {
+         return;
+     }
+     // only forward content of tags from the text:-namespace
+     // (the bit of the innermost open element is at nodeDepth-1)
+     if (textNodeStack.get(nodeDepth - 1)) {
+         super.characters(ch, start, length);
+     }
+ }
+
+ // Tells whether an element must be suppressed together with everything
+ // nested inside it: text:*-template, text:*-style and table:covered-table-cell.
+ private final boolean needsCompleteFiltering(String namespaceURI, String localName) {
+     if (TEXT_NS.equals(namespaceURI)) {
+         return localName.endsWith("-template") || localName.endsWith("-style");
+     }
+     if (TABLE_NS.equals(namespaceURI)) {
+         return "covered-table-cell".equals(localName);
+     }
+     return false;
+ }
+
+ // Maps the heading level (text:outline-level attribute) to an <hX> HTML tag
+ // name. Levels are clamped to the range 1..6. A missing attribute, or a
+ // value that is not a valid integer, yields "h1" so that one malformed
+ // attribute cannot abort the whole parse with a NumberFormatException.
+ private final String getXHTMLHeaderTagName(Attributes atts) {
+     final String depthStr = atts.getValue(TEXT_NS, "outline-level");
+     if (depthStr == null) {
+         return "h1";
+     }
+     int depth;
+     try {
+         depth = Integer.parseInt(depthStr.trim());
+     } catch (NumberFormatException e) {
+         // malformed level: treat the heading as top-level
+         return "h1";
+     }
+     return "h" + Math.max(1, Math.min(6, depth));
+ }
+
+ @Override
+ public void startElement(String namespaceURI, String localName, String qName, Attributes atts)
+         throws SAXException {
+     // Record the kind of the node being opened: the bit at the current depth
+     // of textNodeStack is set when the element is in the text:-namespace.
+     // characters() inspects the topmost bit to decide whether text is printed;
+     // nodeDepth is the depth of the current node and marks the top of the stack.
+     final boolean inTextNs = TEXT_NS.equals(namespaceURI);
+     assert nodeDepth>=0;
+     textNodeStack.set(nodeDepth++, inTextNs);
+
+     // some tags are filtered together with *all* of their content
+     assert completelyFiltered>=0;
+     if (needsCompleteFiltering(namespaceURI, localName)) {
+         completelyFiltered++;
+     }
+     if (completelyFiltered != 0) {
+         return; // inside a filtered subtree: do not call the next handler
+     }
+
+     if (inTextNs && "h".equals(localName)) {
+         // text:h is passed directly to the xhtml handler; the chosen tag name
+         // is remembered so that endElement can close the same tag
+         final String headingTag = getXHTMLHeaderTagName(atts);
+         headingStack.push(headingTag);
+         xhtml.startElement(headingTag);
+     } else {
+         super.startElement(namespaceURI, localName, qName, atts);
+     }
+ }
+
+ @Override
+ public void endElement(String namespaceURI, String localName, String qName) throws SAXException {
+     final boolean inTextNs = TEXT_NS.equals(namespaceURI);
+
+     // the next handler is only called outside of filtered subtrees
+     if (completelyFiltered == 0) {
+         if (inTextNs && "h".equals(localName)) {
+             // text:h is passed directly to the xhtml handler, closing the
+             // tag that startElement pushed
+             xhtml.endElement(headingStack.pop());
+         } else {
+             super.endElement(namespaceURI, localName, qName);
+         }
+
+         // tabulators are emitted as a tab character
+         final boolean isTabulator =
+                 inTextNs && ("tab-stop".equals(localName) || "tab".equals(localName));
+         if (isTabulator) {
+             characters(TAB, 0, TAB.length);
+         }
+     }
+
+     // leave the filter for *all* content of some tags
+     if (needsCompleteFiltering(namespaceURI, localName)) {
+         completelyFiltered--;
+     }
+     assert completelyFiltered>=0;
+
+     // the current node is closed: reduce the depth
+     nodeDepth--;
+     assert nodeDepth>=0;
+ }
+
+ @Override
+ public void startPrefixMapping(String prefix, String
uri) throws SAXException {
+ // remove prefix mappings as they should not
occur in XHTML
+ }
+
+ @Override
+ public void endPrefixMapping(String prefix) throws
SAXException {
+ // remove prefix mappings as they should not
occur in XHTML
+ }
+
+ };
+
+ try {
+ SAXParserFactory factory =
SAXParserFactory.newInstance();
+ factory.setValidating(false);
+ factory.setNamespaceAware(true);
+ SAXParser parser = factory.newSAXParser();
+ parser.parse(new CloseShieldInputStream(stream),new
NSNormalizerContentHandler(dh));
+ } catch (ParserConfigurationException e) {
+ throw new TikaException("XML parser configuration
error", e);
+ }
+ }
}
Modified:
lucene/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeMetaParser.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeMetaParser.java?rev=722663&r1=722662&r2=722663&view=diff
==============================================================================
---
lucene/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeMetaParser.java
(original)
+++
lucene/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeMetaParser.java
Tue Dec 2 15:13:05 2008
@@ -65,6 +65,7 @@
dh = getStatistic(dh, md, "nbPara", "paragraph-count");
dh = getStatistic(dh, md, "nbWord", "word-count");
dh = getStatistic(dh, md, "nbCharacter", "character-count");
+ dh = new NSNormalizerContentHandler(dh);
return dh;
}
Added:
lucene/tika/trunk/src/main/java/org/apache/tika/sax/ElementMappingContentHandler.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/sax/ElementMappingContentHandler.java?rev=722663&view=auto
==============================================================================
---
lucene/tika/trunk/src/main/java/org/apache/tika/sax/ElementMappingContentHandler.java
(added)
+++
lucene/tika/trunk/src/main/java/org/apache/tika/sax/ElementMappingContentHandler.java
Tue Dec 2 15:13:05 2008
@@ -0,0 +1,115 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.sax;
+
+import java.util.Map;
+import java.util.Collections;
+import javax.xml.namespace.QName;
+
+import org.xml.sax.SAXException;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.Attributes;
+import org.xml.sax.helpers.AttributesImpl;
+
+/**
+ * Content handler decorator that maps element <code>QName</code>s using a
<code>Map</code>.
+ * Not mappable elements are not forwarded.
+ * Attributes may also be mapped (for each element different using a
<code>Map</code> for attributes),
+ * not mappable attributes are not forwarded.
+ * The default is to not map any attributes and therefore do not forward any
of them.
+ */
+public class ElementMappingContentHandler extends ContentHandlerDecorator {
+
+ private final Map<QName,TargetElement> mappings;
+
+ public ElementMappingContentHandler(ContentHandler handler,
Map<QName,TargetElement> mappings) {
+ super(handler);
+ this.mappings=mappings;
+ }
+
+ @Override
+ public void startElement(String namespaceURI, String localName, String
qName, Attributes atts) throws SAXException {
+ final TargetElement mapping=mappings.get(new
QName(namespaceURI,localName));
+ if (mapping!=null) {
+ final QName tag=mapping.getMappedTagName();
+
super.startElement(tag.getNamespaceURI(),tag.getLocalPart(),getQNameAsString(tag),mapping.mapAttributes(atts));
+ }
+ }
+
+ @Override
+ public void endElement(String namespaceURI, String localName, String
qName) throws SAXException {
+ final TargetElement mapping=mappings.get(new
QName(namespaceURI,localName));
+ if (mapping!=null) {
+ final QName tag=mapping.getMappedTagName();
+
super.endElement(tag.getNamespaceURI(),tag.getLocalPart(),getQNameAsString(tag));
+ }
+ }
+
+ protected static final String getQNameAsString(final QName qname) {
+ final StringBuilder qn=new StringBuilder(qname.getPrefix());
+ if (qn.length()>0) qn.append(':');
+ return qn.append(qname.getLocalPart()).toString();
+ }
+
+ public static class TargetElement {
+
+ /** Creates an TargetElement, attributes of this element will
be mapped as specified */
+ public TargetElement(QName mappedTagName, Map<QName,QName>
attributesMapping) {
+ this.mappedTagName=mappedTagName;
+ this.attributesMapping=attributesMapping;
+ }
+
+ /** A shortcut that automatically creates the QName object */
+ public TargetElement(String mappedTagURI, String
mappedTagLocalName, Map<QName,QName> attributesMapping) {
+ this(new QName(mappedTagURI,mappedTagLocalName),
attributesMapping);
+ }
+
+ /** Creates an TargetElement with no attributes, all attributes
will be deleted from SAX stream */
+ public TargetElement(QName mappedTagName) {
+ this(mappedTagName,
Collections.<QName,QName>emptyMap());
+ }
+
+ /** A shortcut that automatically creates the QName object */
+ public TargetElement(String mappedTagURI, String
mappedTagLocalName) {
+ this(mappedTagURI, mappedTagLocalName,
Collections.<QName,QName>emptyMap());
+ }
+
+ public QName getMappedTagName() {
+ return mappedTagName;
+ }
+
+ public Map<QName,QName> getAttributesMapping() {
+ return attributesMapping;
+ }
+
+ public Attributes mapAttributes(final Attributes atts) {
+ final AttributesImpl natts = new AttributesImpl();
+ for (int i = 0; i < atts.getLength(); i++) {
+ QName name=attributesMapping.get(new
QName(atts.getURI(i), atts.getLocalName(i)));
+ if (name!=null) natts.addAttribute(
+ name.getNamespaceURI(),
name.getLocalPart(), getQNameAsString(name),
+ atts.getType(i), atts.getValue(i)
+ );
+ }
+ return natts;
+ }
+
+ private final QName mappedTagName;
+ private final Map<QName,QName> attributesMapping;
+ }
+
+}
Modified: lucene/tika/trunk/src/main/resources/mime/tika-mimetypes.xml
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/resources/mime/tika-mimetypes.xml?rev=722663&r1=722662&r2=722663&view=diff
==============================================================================
--- lucene/tika/trunk/src/main/resources/mime/tika-mimetypes.xml (original)
+++ lucene/tika/trunk/src/main/resources/mime/tika-mimetypes.xml Tue Dec 2
15:13:05 2008
@@ -387,6 +387,20 @@
</magic>
</mime-type>
+ <mime-type type="application/vnd.sun.xml.writer">
+ <comment>
+ OpenOffice v1.0: Writer Document
+ </comment>
+ <alias type="application/x-vnd.sun.xml.writer" />
+ <glob pattern="*.sxw" />
+ <magic>
+ <match type="string" offset="0" value="PK">
+ <match type="string" offset="30"
+ value="mimetypeapplication/vnd.sun.xml.writer" />
+ </match>
+ </magic>
+ </mime-type>
+
<mime-type type="application/zip">
<alias type="application/x-zip-compressed" />
<magic priority="40">